Skip to content

Commit 71cf61e

Browse files
authored
feat: support set spark_config and use to request taskmanager (#3613)
* Support set spark_config and use to request taskmanager * Format cpp code * Format cpp string
1 parent b005cd1 commit 71cf61e

File tree

4 files changed

+71
-2
lines changed

4 files changed

+71
-2
lines changed

docs/en/reference/sql/ddl/SET_STATEMENT.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ The following format is also equivalent.
3434
| @@session.enable_trace|@@enable_trace | When the value is `true`, an error message stack will be printed when the SQL statement has a syntax error or an error occurs during the plan generation process. <br />When the value is `false`, only the basic error message will be printed if there is a SQL syntax error or an error occurs during the plan generation process. | `true`, <br /> `false` | `false` |
3535
| @@session.sync_job|@@sync_job | When the value is `true`, the offline command will be executed synchronously, waiting for the final result of the execution.<br />When the value is `false`, the offline command returns immediately. If you need to check the execution, please use `SHOW JOB` command. | `true`, <br /> `false` | `false` |
3636
| @@session.sync_timeout|@@sync_timeout | When `sync_job=true`, you can configure the waiting time for synchronization commands. The timeout will return immediately. After the timeout returns, you can still view the command execution through `SHOW JOB`. | Int | 20000 |
37+
| @@session.spark_config|@@spark_config | Set the Spark configuration for offline jobs, configure like 'spark.executor.memory=2g;spark.executor.cores=2'. Notice that the priority of this Spark configuration is higher than TaskManager Spark configuration but lower than CLI Spark configuration file. | String | "" |
3738

3839
## Example
3940

docs/zh/openmldb_sql/ddl/SET_STATEMENT.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ sessionVariableName ::= '@@'Identifier | '@@session.'Identifier | '@@global.'Ide
3535
| @@session.enable_trace|@@enable_trace | 当该变量值为 `true`,SQL语句有语法错误或者在计划生成过程发生错误时,会打印错误信息栈。<br />当该变量值为 `false`,SQL语句有语法错误或者在计划生成过程发生错误时,仅打印基本错误信息。 | "true" \| "false" | "false" |
3636
| @@session.sync_job|@@sync_job | 当该变量值为 `true`,离线的命令将变为同步,等待执行的最终结果。<br />当该变量值为 `false`,离线的命令即时返回,若要查看命令的执行情况,请使用`SHOW JOB`| "true" \| "false" | "false" |
3737
| @@session.job_timeout|@@job_timeout | 可配置离线异步命令或离线管理命令的等待时间(以*毫秒*为单位),将立即返回。离线异步命令返回后仍可通过`SHOW JOB`查看命令执行情况。 | Int | "20000" |
38-
38+
| @@session.spark_config|@@spark_config | 设置离线任务的 Spark 参数,配置项参考 'spark.executor.memory=2g;spark.executor.cores=2'。注意此 Spark 配置优先级高于 TaskManager 默认 Spark 配置,低于命令行的 Spark 配置文件。 | String | "" |
3939
## Example
4040

4141
### 设置和显示会话系统变量

src/sdk/sql_cluster_router.cc

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@
1919
#include <algorithm>
2020
#include <fstream>
2121
#include <future>
22+
#include <iostream>
2223
#include <memory>
2324
#include <string>
25+
#include <sstream>
2426
#include <unordered_map>
2527
#include <utility>
2628

@@ -319,6 +321,7 @@ bool SQLClusterRouter::Init() {
319321
session_variables_.emplace("enable_trace", "false");
320322
session_variables_.emplace("sync_job", "false");
321323
session_variables_.emplace("job_timeout", "60000"); // rpc request timeout for taskmanager
324+
session_variables_.emplace("spark_config", "");
322325
}
323326
return true;
324327
}
@@ -2980,7 +2983,7 @@ std::shared_ptr<hybridse::sdk::ResultSet> SQLClusterRouter::ExecuteOfflineQuery(
29802983
bool is_sync_job, int job_timeout,
29812984
::hybridse::sdk::Status* status) {
29822985
RET_IF_NULL_AND_WARN(status, "output status is nullptr");
2983-
std::map<std::string, std::string> config;
2986+
std::map<std::string, std::string> config = ParseSparkConfigString(GetSparkConfig());
29842987
ReadSparkConfFromFile(std::dynamic_pointer_cast<SQLRouterOptions>(options_)->spark_conf_path, &config);
29852988

29862989
if (is_sync_job) {
@@ -3049,6 +3052,16 @@ int SQLClusterRouter::GetJobTimeout() {
30493052
return 60000;
30503053
}
30513054

3055+
std::string SQLClusterRouter::GetSparkConfig() {
3056+
std::lock_guard<::openmldb::base::SpinMutex> lock(mu_);
3057+
auto it = session_variables_.find("spark_config");
3058+
if (it != session_variables_.end()) {
3059+
return it->second;
3060+
}
3061+
3062+
return "";
3063+
}
3064+
30523065
::hybridse::sdk::Status SQLClusterRouter::SetVariable(hybridse::node::SetPlanNode* node) {
30533066
std::string key = node->Key();
30543067
std::transform(key.begin(), key.end(), key.begin(), ::tolower);
@@ -3083,13 +3096,34 @@ ::hybridse::sdk::Status SQLClusterRouter::SetVariable(hybridse::node::SetPlanNod
30833096
if (!absl::SimpleAtoi(value, &new_timeout)) {
30843097
return {StatusCode::kCmdError, "Fail to parse value, can't set the request timeout"};
30853098
}
3099+
} else if (key == "spark_config") {
3100+
if (!CheckSparkConfigString(value)) {
3101+
return {
3102+
StatusCode::kCmdError,
3103+
"Fail to parse spark config, set like 'spark.executor.memory=2g;spark.executor.cores=2'"
3104+
};
3105+
}
30863106
} else {
30873107
return {};
30883108
}
30893109
session_variables_[key] = value;
30903110
return {};
30913111
}
30923112

3113+
bool SQLClusterRouter::CheckSparkConfigString(const std::string& input) {
    // Validate a session spark_config value such as
    // "spark.executor.memory=2g;spark.executor.cores=2".
    // Every ';'-separated entry must be a "spark.<name>=<value>" pair.
    // An empty string is valid (no configuration).
    std::istringstream iss(input);
    std::string key_value;

    while (std::getline(iss, key_value, ';')) {
        // Each entry must carry the "spark." prefix.
        if (key_value.find("spark.") != 0) {
            return false;
        }
        // Bug fix: also require a '=' separator. Previously an entry like
        // "spark.executor.memory" (no value) passed validation, was stored in
        // the session variables, and was later dropped silently by
        // ParseSparkConfigString with only a stderr message. Rejecting it here
        // makes SET fail immediately with the documented error message.
        if (key_value.find('=') == std::string::npos) {
            return false;
        }
    }

    return true;
}
3126+
30933127
::hybridse::sdk::Status SQLClusterRouter::ParseNamesFromArgs(const std::string& db,
30943128
const std::vector<std::string>& args, std::string* db_name, std::string* name) {
30953129
if (args.size() == 1) {
@@ -4523,6 +4557,34 @@ bool SQLClusterRouter::CheckTableStatus(const std::string& db, const std::string
45234557
return check_succeed;
45244558
}
45254559

4560+
std::map<std::string, std::string> SQLClusterRouter::ParseSparkConfigString(const std::string& input) {
    // Convert "spark.k1=v1;spark.k2=v2" into a {key -> value} map.
    // Malformed entries (missing '=' or a key without the "spark." prefix)
    // are reported on stderr and skipped rather than aborting the parse.
    std::map<std::string, std::string> config_map;

    std::istringstream stream(input);
    std::string entry;

    while (std::getline(stream, entry, ';')) {
        const size_t equal_pos = entry.find('=');
        if (equal_pos == std::string::npos) {
            // No '=' separator: not a key-value pair, skip it.
            std::cerr << "Error: Invalid key-value pair - " << entry << std::endl;
            continue;
        }

        const std::string key = entry.substr(0, equal_pos);
        if (key.find("spark.") == 0) {
            // Well-formed Spark option: record it.
            config_map[key] = entry.substr(equal_pos + 1);
        } else {
            std::cerr << "Error: Key does not start with 'spark.' - " << key << std::endl;
        }
    }

    return config_map;
}
4587+
45264588
void SQLClusterRouter::ReadSparkConfFromFile(std::string conf_file_path, std::map<std::string, std::string>* config) {
45274589
if (!conf_file_path.empty()) {
45284590
boost::property_tree::ptree pt;

src/sdk/sql_cluster_router.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,12 @@ class SQLClusterRouter : public SQLRouter {
283283
// get job timeout from the session variables, we will use the timeout when sending requests to the taskmanager
284284
int GetJobTimeout();
285285

286+
std::string GetSparkConfig();
287+
288+
std::map<std::string, std::string> ParseSparkConfigString(const std::string& input);
289+
290+
bool CheckSparkConfigString(const std::string& input);
291+
286292
::openmldb::base::Status ExecuteOfflineQueryAsync(const std::string& sql,
287293
const std::map<std::string, std::string>& config,
288294
const std::string& default_db, int job_timeout,

0 commit comments

Comments
 (0)