From 78f6e34d89a458fd677ea4184259305eb0cebc19 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Wed, 25 Apr 2018 13:28:19 -0400 Subject: [PATCH 01/42] Add more information to CopyStatement. Cleaned up includes. --- src/binder/bind_node_visitor.cpp | 12 ++++++++- src/codegen/translator_factory.cpp | 1 + src/common/internal_types.cpp | 30 +++++++++++++++++++++ src/include/common/internal_types.h | 9 +++++++ src/include/parser/copy_statement.h | 35 ++++++++++++++++-------- src/include/parser/postgresparser.h | 2 +- src/include/planner/abstract_plan.h | 2 +- src/include/planner/copy_plan.h | 18 ++++--------- src/optimizer/util.cpp | 8 +++--- src/parser/copy_statement.cpp | 2 +- src/parser/postgresparser.cpp | 42 ++++++++++++++++++++++------- test/parser/postgresparser_test.cpp | 1 + test/trigger/trigger_test.cpp | 3 ++- 13 files changed, 122 insertions(+), 43 deletions(-) diff --git a/src/binder/bind_node_visitor.cpp b/src/binder/bind_node_visitor.cpp index a6ffe17b322..c7d25093beb 100644 --- a/src/binder/bind_node_visitor.cpp +++ b/src/binder/bind_node_visitor.cpp @@ -166,7 +166,17 @@ void BindNodeVisitor::Visit(parser::DeleteStatement *node) { } void BindNodeVisitor::Visit(parser::LimitDescription *) {} -void BindNodeVisitor::Visit(parser::CopyStatement *) {} + +void BindNodeVisitor::Visit(parser::CopyStatement *node) { + // Bind the source/target table of the copy + context_ = std::make_shared(nullptr); + if (node->table != nullptr) { + node->table->Accept(this); + } else { + node->select_stmt->Accept(this); + } +} + void BindNodeVisitor::Visit(parser::CreateFunctionStatement *) {} void BindNodeVisitor::Visit(parser::CreateStatement *node) { node->TryBindDatabaseName(default_database_name_); diff --git a/src/codegen/translator_factory.cpp b/src/codegen/translator_factory.cpp index 6fe1f65fce6..f10fd863033 100644 --- a/src/codegen/translator_factory.cpp +++ b/src/codegen/translator_factory.cpp @@ -37,6 +37,7 @@ #include "expression/case_expression.h" #include "expression/comparison_expression.h" #include "expression/conjunction_expression.h" +#include "expression/constant_value_expression.h" #include "expression/function_expression.h" #include "expression/operator_expression.h" #include "expression/tuple_value_expression.h" diff --git a/src/common/internal_types.cpp b/src/common/internal_types.cpp index b93da7f3b13..1ab2ed393b3 100644 --- a/src/common/internal_types.cpp +++ b/src/common/internal_types.cpp @@ -1877,6 +1877,36 @@ std::ostream &operator<<(std::ostream &os, const CopyType &type) { return os; } +//===--------------------------------------------------------------------===// +// ExternalFileFormat - String Utilities +//===--------------------------------------------------------------------===// + +std::string ExternalFileFormatToString(ExternalFileFormat format) { + switch (format) { + case ExternalFileFormat::CSV: + return "CSV"; + case ExternalFileFormat::BINARY: + default: + return "BINARY"; + } +} + +ExternalFileFormat StringToExternalFileFormat(const std::string &str) { + auto upper = StringUtil::Upper(str); + if (upper == "CSV") { + return ExternalFileFormat::CSV; + } else if (upper == "BINARY") { + return ExternalFileFormat::BINARY; + } + throw ConversionException(StringUtil::Format( + "No ExternalFileFormat for input '%s'", upper.c_str())); +} + +std::ostream &operator<<(std::ostream &os, const ExternalFileFormat &format) { + os << ExternalFileFormatToString(format); + return os; +} + //===--------------------------------------------------------------------===// // PayloadType - String Utilities //===--------------------------------------------------------------------===// diff --git a/src/include/common/internal_types.h b/src/include/common/internal_types.h index 995a92cea2d..6c32b9665d0 100644 --- a/src/include/common/internal_types.h +++ b/src/include/common/internal_types.h @@ -556,6 +556,7 @@ enum class PlanNodeType { // Scan Nodes SEQSCAN = 10, INDEXSCAN = 11, + CSVSCAN = 12, // Join Nodes NESTLOOP = 20, @@ -817,6 +818,14 @@ std::string CopyTypeToString(CopyType type); CopyType StringToCopyType(const std::string &str); std::ostream &operator<<(std::ostream &os, const CopyType &type); +enum class ExternalFileFormat { + CSV, + BINARY +}; +std::string ExternalFileFormatToString(ExternalFileFormat format); +ExternalFileFormat StringToExternalFileFormat(const std::string &str); +std::ostream &operator<<(std::ostream &os, const ExternalFileFormat &format); + //===--------------------------------------------------------------------===// // Payload Types //===--------------------------------------------------------------------===// diff --git a/src/include/parser/copy_statement.h b/src/include/parser/copy_statement.h index 3af77a797c4..92100e312d1 100644 --- a/src/include/parser/copy_statement.h +++ b/src/include/parser/copy_statement.h @@ -2,19 +2,19 @@ // // Peloton // -// statement_import.h +// copy_statement.h // -// Identification: src/include/parser/statement_import.h +// Identification: src/include/parser/copy_statement.h // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// #pragma once +#include "parser/select_statement.h" #include "parser/sql_statement.h" #include "parser/table_ref.h" -#include "expression/constant_value_expression.h" #include "common/sql_node_visitor.h" namespace peloton { @@ -26,25 +26,38 @@ namespace parser { */ class CopyStatement : public SQLStatement { public: - CopyStatement(CopyType type) + CopyStatement() : SQLStatement(StatementType::COPY), - cpy_table(nullptr), - type(type), - delimiter(','){}; + table(nullptr), + type(), + delimiter(',') {} - virtual ~CopyStatement() {} + ~CopyStatement() = default; - virtual void Accept(SqlNodeVisitor *v) override { v->Visit(this); } + void Accept(SqlNodeVisitor *v) override { v->Visit(this); } const std::string GetInfo(int num_indent) const override; const std::string GetInfo() const override; - std::unique_ptr cpy_table; + ////////////////////////////////////////////////////////////////////////////// + /// + /// Public member fields + /// + ////////////////////////////////////////////////////////////////////////////// + + std::unique_ptr table; + + std::unique_ptr select_stmt; CopyType type; std::string file_path; + + ExternalFileFormat format; + + bool is_from; + char delimiter; }; diff --git a/src/include/parser/postgresparser.h b/src/include/parser/postgresparser.h index decd43d9ee7..388623a138c 100644 --- a/src/include/parser/postgresparser.h +++ b/src/include/parser/postgresparser.h @@ -6,7 +6,7 @@ // // Identification: src/include/parser/postgresparser.h // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// diff --git a/src/include/planner/abstract_plan.h b/src/include/planner/abstract_plan.h index 2cb5e89ac49..c257b20d830 100644 --- a/src/include/planner/abstract_plan.h +++ b/src/include/planner/abstract_plan.h @@ -6,7 +6,7 @@ // // Identification: src/include/planner/abstract_plan.h // -// Copyright (c) 2015-18, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// diff --git a/src/include/planner/copy_plan.h b/src/include/planner/copy_plan.h index 079199cf755..fcb991b1666 100644 --- a/src/include/planner/copy_plan.h +++ b/src/include/planner/copy_plan.h @@ -6,43 +6,35 @@ // // Identification: src/include/planner/copy_plan.h // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// #pragma once -#include "../parser/copy_statement.h" -#include "../parser/select_statement.h" #include "planner/abstract_plan.h" namespace peloton { namespace storage { class DataTable; -} - -namespace parser { -class CopyStatement; -} +} // namespace storage namespace planner { class CopyPlan : public AbstractPlan { public: - CopyPlan() = delete; - explicit CopyPlan(std::string file_path, bool deserialize_parameters) : file_path(file_path), deserialize_parameters(deserialize_parameters) { LOG_DEBUG("Creating a Copy Plan"); } - inline PlanNodeType GetPlanNodeType() const { return PlanNodeType::COPY; } + PlanNodeType GetPlanNodeType() const override { return PlanNodeType::COPY; } - const std::string GetInfo() const { return "CopyPlan"; } + const std::string GetInfo() const override { return "CopyPlan"; } // TODO: Implement copy mechanism - std::unique_ptr Copy() const { return nullptr; } + std::unique_ptr Copy() const override { return nullptr; } // The path of the target file std::string file_path; diff --git a/src/optimizer/util.cpp b/src/optimizer/util.cpp index 0d01e35e8ac..b0129484442 100644 --- a/src/optimizer/util.cpp +++ b/src/optimizer/util.cpp @@ -144,7 +144,7 @@ bool ContainsJoinColumns(const std::unordered_set &l_group_alias, std::unique_ptr CreateCopyPlan( parser::CopyStatement *copy_stmt) { - std::string table_name(copy_stmt->cpy_table->GetTableName()); + std::string table_name(copy_stmt->table->GetTableName()); bool deserialize_parameters = false; // If we're copying the query metric table, then we need to handle the @@ -160,9 +160,9 @@ std::unique_ptr CreateCopyPlan( auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); auto txn = txn_manager.BeginTransaction(); auto target_table = catalog::Catalog::GetInstance()->GetTableWithName( - copy_stmt->cpy_table->GetDatabaseName(), - copy_stmt->cpy_table->GetSchemaName(), - copy_stmt->cpy_table->GetTableName(), txn); + copy_stmt->table->GetDatabaseName(), + copy_stmt->table->GetSchemaName(), + copy_stmt->table->GetTableName(), txn); txn_manager.CommitTransaction(txn); std::unique_ptr select_plan( diff --git a/src/parser/copy_statement.cpp b/src/parser/copy_statement.cpp index b39fcbc8782..e4c5cd3d621 100644 --- a/src/parser/copy_statement.cpp +++ b/src/parser/copy_statement.cpp @@ -20,7 +20,7 @@ const std::string CopyStatement::GetInfo(int num_indent) const { os << StringUtil::Indent(num_indent) << "CopyStatement\n"; os << StringUtil::Indent(num_indent + 1) << "-> Type :: " << CopyTypeToString(type) << "\n"; - os << cpy_table.get()->GetInfo(num_indent + 1) << std::endl; + os << table.get()->GetInfo(num_indent + 1) << std::endl; os << StringUtil::Indent(num_indent + 1) << "-> File Path :: " << file_path << std::endl; diff --git a/src/parser/postgresparser.cpp b/src/parser/postgresparser.cpp index 797b77406b5..ffbea10e39d 100644 --- a/src/parser/postgresparser.cpp +++ b/src/parser/postgresparser.cpp @@ -1505,19 +1505,41 @@ parser::PrepareStatement *PostgresParser::PrepareTransform(PrepareStmt *root) { return result; } -// TODO: Only support COPY TABLE TO FILE and DELIMITER option parser::CopyStatement *PostgresParser::CopyTransform(CopyStmt *root) { - auto result = new CopyStatement(peloton::CopyType::EXPORT_OTHER); - result->cpy_table.reset(RangeVarTransform(root->relation)); - result->file_path = root->filename; - for (auto cell = root->options->head; cell != NULL; cell = cell->next) { - auto def_elem = reinterpret_cast(cell->data.ptr_value); - if (strcmp(def_elem->defname, "delimiter") == 0) { - auto delimiter = reinterpret_cast(def_elem->arg)->val.str; - result->delimiter = *delimiter; - break; + static constexpr char kDelimiterTok[] = "delimiter"; + static constexpr char kFormatTok[] = "format"; + + // The main return value + auto *result = new CopyStatement(); + + if (root->relation) { + result->table.reset(RangeVarTransform(root->relation)); + } else { + result->select_stmt.reset( + SelectTransform(reinterpret_cast(root->query))); + } + + result->file_path = (root->filename != nullptr ? root->filename : ""); + result->is_from = root->is_from; + + // Handle options + ListCell *cell = nullptr; + for_each_cell(cell, root->options->head) { + auto *def_elem = reinterpret_cast(cell->data.ptr_value); + + // Check delimiter + if (strncmp(def_elem->defname, kDelimiterTok, sizeof(kDelimiterTok)) == 0) { + auto *delimiter_val = reinterpret_cast(def_elem->arg); + result->delimiter = *delimiter_val->val.str; + } + + // Check format + if (strncmp(def_elem->defname, kFormatTok, sizeof(kFormatTok)) == 0) { + auto *format_val = reinterpret_cast(def_elem->arg); + result->format = StringToExternalFileFormat(format_val->val.str); } } + return result; } diff --git a/test/parser/postgresparser_test.cpp b/test/parser/postgresparser_test.cpp index 36910bdc9a9..dee0d981491 100644 --- a/test/parser/postgresparser_test.cpp +++ b/test/parser/postgresparser_test.cpp @@ -17,6 +17,7 @@ #include "common/internal_types.h" #include "common/logger.h" #include "common/macros.h" +#include "expression/constant_value_expression.h" #include "expression/function_expression.h" #include "expression/operator_expression.h" #include "expression/tuple_value_expression.h" diff --git a/test/trigger/trigger_test.cpp b/test/trigger/trigger_test.cpp index 4dacd00cc1d..7238d7f9e6d 100644 --- a/test/trigger/trigger_test.cpp +++ b/test/trigger/trigger_test.cpp @@ -10,17 +10,18 @@ // //===----------------------------------------------------------------------===// -#include "trigger/trigger.h" #include "catalog/catalog.h" #include "common/harness.h" #include "concurrency/transaction_manager_factory.h" #include "executor/executors.h" #include "executor/executor_context.h" +#include "expression/constant_value_expression.h" #include "parser/pg_trigger.h" #include "parser/postgresparser.h" #include "planner/create_plan.h" #include "planner/insert_plan.h" #include "storage/abstract_table.h" +#include "trigger/trigger.h" namespace peloton { namespace test { From ab08eba35db21cdf9eb688791d1d2f912d6b59a0 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Thu, 26 Apr 2018 01:55:20 -0400 Subject: [PATCH 02/42] Add CSVScan node to ToString and FromString. Removed BINARY external format for now. --- src/common/internal_types.cpp | 11 ++++++----- src/include/common/internal_types.h | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/common/internal_types.cpp b/src/common/internal_types.cpp index 1ab2ed393b3..427e9848e25 100644 --- a/src/common/internal_types.cpp +++ b/src/common/internal_types.cpp @@ -1304,6 +1304,9 @@ std::string PlanNodeTypeToString(PlanNodeType type) { case PlanNodeType::INDEXSCAN: { return ("INDEXSCAN"); } + case PlanNodeType::CSVSCAN: { + return ("CSVSCAN"); + } case PlanNodeType::NESTLOOP: { return ("NESTLOOP"); } @@ -1408,6 +1411,8 @@ PlanNodeType StringToPlanNodeType(const std::string &str) { return PlanNodeType::SEQSCAN; } else if (upper_str == "INDEXSCAN") { return PlanNodeType::INDEXSCAN; + } else if (upper_str == "CSVSCAN") { + return PlanNodeType::CSVSCAN; } else if (upper_str == "NESTLOOP") { return PlanNodeType::NESTLOOP; } else if (upper_str == "NESTLOOPINDEX") { @@ -1884,10 +1889,8 @@ std::ostream &operator<<(std::ostream &os, const CopyType &type) { std::string ExternalFileFormatToString(ExternalFileFormat format) { switch (format) { case ExternalFileFormat::CSV: - return "CSV"; - case ExternalFileFormat::BINARY: default: - return "BINARY"; + return "CSV"; } } @@ -1895,8 +1898,6 @@ ExternalFileFormat StringToExternalFileFormat(const std::string &str) { auto upper = StringUtil::Upper(str); if (upper == "CSV") { return ExternalFileFormat::CSV; - } else if (upper == "BINARY") { - return ExternalFileFormat::BINARY; } throw ConversionException(StringUtil::Format( "No ExternalFileFormat for input '%s'", upper.c_str())); diff --git a/src/include/common/internal_types.h b/src/include/common/internal_types.h index 6c32b9665d0..4654ec9bc77 100644 --- a/src/include/common/internal_types.h +++ b/src/include/common/internal_types.h @@ -820,7 +820,6 @@ std::ostream &operator<<(std::ostream &os, const CopyType &type); enum class ExternalFileFormat { CSV, - BINARY }; std::string ExternalFileFormatToString(ExternalFileFormat format); ExternalFileFormat StringToExternalFileFormat(const std::string &str); @@ -1345,6 +1344,7 @@ enum class RuleType : uint32_t { GET_TO_SEQ_SCAN, GET_TO_INDEX_SCAN, QUERY_DERIVED_GET_TO_PHYSICAL, + EXTERNAL_FILE_GET_TO_PHYSICAL, DELETE_TO_PHYSICAL, UPDATE_TO_PHYSICAL, INSERT_TO_PHYSICAL, From 2c681f05467d10d7c0ee7f05ad520d6cf91d4b12 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Thu, 26 Apr 2018 02:01:58 -0400 Subject: [PATCH 03/42] Move COPY from DDL to DML processing. COPY now goes through planner/optimization. --- src/executor/copy_executor.cpp | 32 ---------- src/include/executor/copy_executor.h | 3 - .../optimizer/child_property_deriver.h | 1 + src/include/optimizer/cost_calculator.h | 1 + src/include/optimizer/operator_node.h | 3 + src/include/optimizer/operator_visitor.h | 3 + src/include/optimizer/operators.h | 52 +++++++++++++-- src/include/optimizer/optimizer.h | 41 ++++++++---- src/include/optimizer/plan_generator.h | 2 + src/include/optimizer/rule_impls.h | 12 ++++ src/include/optimizer/util.h | 6 -- src/include/planner/copy_plan.h | 10 +-- src/include/planner/csv_scan_plan.h | 46 +++++++++++++ src/optimizer/child_property_deriver.cpp | 6 ++ src/optimizer/cost_calculator.cpp | 5 ++ src/optimizer/operators.cpp | 64 +++++++++++++++++++ src/optimizer/optimizer.cpp | 7 -- src/optimizer/plan_generator.cpp | 5 ++ .../query_to_operator_transformer.cpp | 30 ++++++++- src/optimizer/rule.cpp | 1 + src/optimizer/rule_impls.cpp | 25 ++++++++ src/optimizer/util.cpp | 33 ---------- 22 files changed, 277 insertions(+), 111 deletions(-) create mode 100644 src/include/planner/csv_scan_plan.h diff --git a/src/executor/copy_executor.cpp b/src/executor/copy_executor.cpp index ce16d8c83eb..e55d665bc6c 100644 --- a/src/executor/copy_executor.cpp +++ b/src/executor/copy_executor.cpp @@ -56,11 +56,6 @@ bool CopyExecutor::DInit() { return false; } LOG_DEBUG("Created target copy output file: %s", node.file_path.c_str()); - - // Whether we're copying the parameters which require deserialization - if (node.deserialize_parameters) { - InitParamColIds(); - } return true; } @@ -122,33 +117,6 @@ void CopyExecutor::FFlushFsync() { } } -void CopyExecutor::InitParamColIds() { - // If we're going to deserialize prepared statement, get the column ids for - // the varbinary columns first - // auto catalog = catalog::Catalog::GetInstance(); - // try { - // auto query_metric_table = - // catalog->GetTableWithName(CATALOG_DATABASE_NAME, QUERY_METRIC_NAME); - // auto schema = query_metric_table->GetSchema(); - // auto &cols = schema->GetColumns(); - // for (unsigned int i = 0; i < cols.size(); i++) { - // auto col_name = cols[i].column_name.c_str(); - // if (std::strcmp(col_name, QUERY_PARAM_TYPE_COL_NAME) == 0) { - // param_type_col_id = i; - // } else if (std::strcmp(col_name, QUERY_PARAM_FORMAT_COL_NAME) == 0) { - // param_format_col_id = i; - // } else if (std::strcmp(col_name, QUERY_PARAM_VAL_COL_NAME) == 0) { - // param_val_col_id = i; - // } else if (std::strcmp(col_name, QUERY_NUM_PARAM_COL_NAME) == 0) { - // num_param_col_id = i; - // } - // } - // } - // catch (Exception &e) { - // e.PrintStackTrace(); - // } -} - void CopyExecutor::Copy(const char *data, int len, bool end_of_line) { // Worst case we need to escape all character and two delimiters while (COPY_BUFFER_SIZE - buff_size - buff_ptr < (size_t)len * 3) { diff --git a/src/include/executor/copy_executor.h b/src/include/executor/copy_executor.h index 31d65adaa1b..a95b6c49e86 100644 --- a/src/include/executor/copy_executor.h +++ b/src/include/executor/copy_executor.h @@ -40,9 +40,6 @@ class CopyExecutor : public AbstractExecutor { bool DExecute(); - // Initialize the column ids for query parameters - void InitParamColIds(); - bool InitFileHandle(const char *name, const char *mode); // Flush the local buffer diff --git a/src/include/optimizer/child_property_deriver.h b/src/include/optimizer/child_property_deriver.h index bd4aeb7b933..dd887ff9af3 100644 --- a/src/include/optimizer/child_property_deriver.h +++ b/src/include/optimizer/child_property_deriver.h @@ -39,6 +39,7 @@ class ChildPropertyDeriver : public OperatorVisitor { void Visit(const DummyScan *) override; void Visit(const PhysicalSeqScan *) override; void Visit(const PhysicalIndexScan *) override; + void Visit(const ExternalFileScan *) override; void Visit(const QueryDerivedScan *op) override; void Visit(const PhysicalOrderBy *) override; void Visit(const PhysicalLimit *) override; diff --git a/src/include/optimizer/cost_calculator.h b/src/include/optimizer/cost_calculator.h index 442f386fc5f..8ef40330d6b 100644 --- a/src/include/optimizer/cost_calculator.h +++ b/src/include/optimizer/cost_calculator.h @@ -27,6 +27,7 @@ class CostCalculator : public OperatorVisitor { void Visit(const DummyScan *) override; void Visit(const PhysicalSeqScan *) override; void Visit(const PhysicalIndexScan *) override; + void Visit(const ExternalFileScan *) override; void Visit(const QueryDerivedScan *) override; void Visit(const PhysicalOrderBy *) override; void Visit(const PhysicalLimit *) override; diff --git a/src/include/optimizer/operator_node.h b/src/include/optimizer/operator_node.h index cb20c163bbe..bfc0653518d 100644 --- a/src/include/optimizer/operator_node.h +++ b/src/include/optimizer/operator_node.h @@ -27,6 +27,7 @@ enum class OpType { Leaf, // Logical ops Get, + LogicalExternalFileGet, LogicalQueryDerivedGet, LogicalProjection, LogicalFilter, @@ -45,12 +46,14 @@ enum class OpType { LogicalUpdate, LogicalLimit, LogicalDistinct, + LogicalExportExternalFile, // Separate between logical and physical ops LogicalPhysicalDelimiter, // Physical ops DummyScan, /* Dummy Physical Op for SELECT without FROM*/ SeqScan, IndexScan, + ExternalFileScan, QueryDerivedScan, OrderBy, PhysicalLimit, diff --git a/src/include/optimizer/operator_visitor.h b/src/include/optimizer/operator_visitor.h index 75b0a9f9c67..50fd98fa024 100644 --- a/src/include/optimizer/operator_visitor.h +++ b/src/include/optimizer/operator_visitor.h @@ -29,6 +29,7 @@ class OperatorVisitor { virtual void Visit(const DummyScan *) {} virtual void Visit(const PhysicalSeqScan *) {} virtual void Visit(const PhysicalIndexScan *) {} + virtual void Visit(const ExternalFileScan *) {} virtual void Visit(const QueryDerivedScan *) {} virtual void Visit(const PhysicalOrderBy *) {} virtual void Visit(const PhysicalLimit *) {} @@ -52,6 +53,7 @@ class OperatorVisitor { // Logical operator virtual void Visit(const LeafOperator *) {} virtual void Visit(const LogicalGet *) {} + virtual void Visit(const LogicalExternalFileGet *) {} virtual void Visit(const LogicalQueryDerivedGet *) {} virtual void Visit(const LogicalFilter *) {} virtual void Visit(const LogicalProjection *) {} @@ -70,6 +72,7 @@ class OperatorVisitor { virtual void Visit(const LogicalUpdate *) {} virtual void Visit(const LogicalDistinct *) {} virtual void Visit(const LogicalLimit *) {} + virtual void Visit(const LogicalExportExternalFile *) {} }; } // namespace optimizer diff --git a/src/include/optimizer/operators.h b/src/include/optimizer/operators.h index a745439251a..7e27240973a 100644 --- a/src/include/optimizer/operators.h +++ b/src/include/optimizer/operators.h @@ -1,4 +1,3 @@ - //===----------------------------------------------------------------------===// // // Peloton @@ -7,7 +6,7 @@ // // Identification: src/include/optimizer/operators.h // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -31,7 +30,7 @@ class UpdateClause; } namespace catalog { - class TableCatalogObject; +class TableCatalogObject; } namespace optimizer { @@ -51,10 +50,10 @@ class LeafOperator : OperatorNode { //===--------------------------------------------------------------------===// class LogicalGet : public OperatorNode { public: - static Operator make(oid_t get_id = 0, - std::vector predicates = {}, - std::shared_ptr table = nullptr, - std::string alias = "", bool update = false); + static Operator make( + oid_t get_id = 0, std::vector predicates = {}, + std::shared_ptr table = nullptr, + std::string alias = "", bool update = false); bool operator==(const BaseOperatorNode &r) override; @@ -68,6 +67,21 @@ class LogicalGet : public OperatorNode { bool is_for_update; }; +//===--------------------------------------------------------------------===// +// External file get +//===--------------------------------------------------------------------===// +class LogicalExternalFileGet : public OperatorNode { + public: + static Operator make(oid_t get_id); + + bool operator==(const BaseOperatorNode &r) override; + + hash_t Hash() const override; + + // identifier for all get operators + oid_t get_id; +}; + //===--------------------------------------------------------------------===// // Query derived get //===--------------------------------------------------------------------===// @@ -304,6 +318,15 @@ class LogicalUpdate : public OperatorNode { const std::vector> *updates; }; +//===--------------------------------------------------------------------===// +// External file get +//===--------------------------------------------------------------------===// +class LogicalExportExternalFile + : public OperatorNode { + public: + static Operator make(); +}; + //===--------------------------------------------------------------------===// // DummyScan //===--------------------------------------------------------------------===// @@ -366,6 +389,21 @@ class PhysicalIndexScan : public OperatorNode { std::vector value_list; }; +//===--------------------------------------------------------------------===// +// Physical external file scan +//===--------------------------------------------------------------------===// +class ExternalFileScan : public OperatorNode { + public: + static Operator make(oid_t get_id); + + bool operator==(const BaseOperatorNode &r) override; + + hash_t Hash() const override; + + // identifier for all get operators + oid_t get_id; +}; + //===--------------------------------------------------------------------===// // Query derived get //===--------------------------------------------------------------------===// diff --git a/src/include/optimizer/optimizer.h b/src/include/optimizer/optimizer.h index 82b1d4c9a05..71d7afca265 100644 --- a/src/include/optimizer/optimizer.h +++ b/src/include/optimizer/optimizer.h @@ -38,9 +38,9 @@ class TransactionContext; } namespace test { - class OptimizerRuleTests_SimpleAssociativeRuleTest_Test; - class OptimizerRuleTests_SimpleAssociativeRuleTest2_Test; -} +class OptimizerRuleTests_SimpleAssociativeRuleTest_Test; +class OptimizerRuleTests_SimpleAssociativeRuleTest2_Test; +} namespace optimizer { @@ -60,8 +60,10 @@ class Optimizer : public AbstractOptimizer { friend class BindingIterator; friend class GroupBindingIterator; - friend class ::peloton::test::OptimizerRuleTests_SimpleAssociativeRuleTest_Test; - friend class ::peloton::test::OptimizerRuleTests_SimpleAssociativeRuleTest2_Test; + friend class ::peloton::test:: + OptimizerRuleTests_SimpleAssociativeRuleTest_Test; + friend class ::peloton::test:: + OptimizerRuleTests_SimpleAssociativeRuleTest2_Test; public: Optimizer(const Optimizer &) = delete; @@ -83,28 +85,41 @@ class Optimizer : public AbstractOptimizer { OptimizerMetadata &GetMetadata() { return metadata_; } /* For test purposes only */ - std::shared_ptr TestInsertQueryTree(parser::SQLStatement *tree, - concurrency::TransactionContext *txn) { + std::shared_ptr TestInsertQueryTree( + parser::SQLStatement *tree, concurrency::TransactionContext *txn) { return InsertQueryTree(tree, txn); } /* For test purposes only */ void TestExecuteTaskStack(OptimizerTaskStack &task_stack, int root_group_id, - std::shared_ptr root_context) { + std::shared_ptr root_context) { return ExecuteTaskStack(task_stack, root_group_id, root_context); } private: - /* HandleDDLStatement - Check and handle DDL statment (currently only support - *CREATE), set - * is_ddl_stmt to false if there is no DDL statement. + /** + * Check and handle the provided DDL statement, returning the resulting plan + * if parsed tree is a DDL statement. The is_ddl_stmt parameter is set to + * indicate if the parse tree was indeed a DDL statement. * - * tree: a peloton query tree representing a select query - * return: the DDL plan if it is a DDL statement + * @param tree A parsed SQL statement + * @param[out] is_ddl_stmt Set to true if the SQL statement is DDL + * @param txn The transactional context + * @return The constructed plan tree representing the DDL statement */ std::unique_ptr HandleDDLStatement( parser::SQLStatement *tree, bool &is_ddl_stmt, concurrency::TransactionContext *txn); + /** + * Construct a plan object for the given parsed copy statement. + * + * @param copy_stmt The copy statement we're transforming + * @param txn The transactional context + * @return The construct plan object for the COPY statement + */ + std::unique_ptr HandleDDLCopyStatement( + parser::CopyStatement *copy_stmt, concurrency::TransactionContext *txn); + /* TransformQueryTree - create an initial operator tree for the given query * to be used in performing optimization. * diff --git a/src/include/optimizer/plan_generator.h b/src/include/optimizer/plan_generator.h index c0a21259bc6..353de6db29f 100644 --- a/src/include/optimizer/plan_generator.h +++ b/src/include/optimizer/plan_generator.h @@ -54,6 +54,8 @@ class PlanGenerator : public OperatorVisitor { void Visit(const PhysicalIndexScan *) override; + void Visit(const ExternalFileScan *) override; + void Visit(const QueryDerivedScan *) override; void Visit(const PhysicalOrderBy *) override; diff --git a/src/include/optimizer/rule_impls.h b/src/include/optimizer/rule_impls.h index 2c40e3f3c81..5ace068138d 100644 --- a/src/include/optimizer/rule_impls.h +++ b/src/include/optimizer/rule_impls.h @@ -73,6 +73,18 @@ class GetToSeqScan : public Rule { OptimizeContext *context) const override; }; +class LogicalExternalFileGetToPhysical : public Rule { + public: + LogicalExternalFileGetToPhysical(); + + bool Check(std::shared_ptr plan, + OptimizeContext *context) const override; + + void Transform(std::shared_ptr input, + std::vector> &transformed, + OptimizeContext *context) const override; +}; + /** * @brief Generate dummy scan for queries like "SELECT 1", there's no actual * table to generate diff --git a/src/include/optimizer/util.h b/src/include/optimizer/util.h index 8b9eb4baeef..877bdee3b96 100644 --- a/src/include/optimizer/util.h +++ b/src/include/optimizer/util.h @@ -122,12 +122,6 @@ bool ContainsJoinColumns(const std::unordered_set &l_group_alias, const std::unordered_set &r_group_alias, const expression::AbstractExpression *expr); -/** - * @brief Create a copy plan based on the copy statement - */ -std::unique_ptr CreateCopyPlan( - parser::CopyStatement *copy_stmt); - /** * @brief Construct the map from subquery column name to the actual expression * at the subquery level, for example SELECT a FROM (SELECT a + b as a FROM diff --git a/src/include/planner/copy_plan.h b/src/include/planner/copy_plan.h index fcb991b1666..082598d10af 100644 --- a/src/include/planner/copy_plan.h +++ b/src/include/planner/copy_plan.h @@ -24,12 +24,9 @@ namespace planner { class CopyPlan : public AbstractPlan { public: - explicit CopyPlan(std::string file_path, bool deserialize_parameters) - : file_path(file_path), deserialize_parameters(deserialize_parameters) { - LOG_DEBUG("Creating a Copy Plan"); - } + explicit CopyPlan(std::string file_path) : file_path(std::move(file_path)) {} - PlanNodeType GetPlanNodeType() const override { return PlanNodeType::COPY; } + PlanNodeType GetPlanNodeType() const override { return PlanNodeType::COPY; } const std::string GetInfo() const override { return "CopyPlan"; } @@ -39,9 +36,6 @@ class CopyPlan : public AbstractPlan { // The path of the target file std::string file_path; - // Whether the copying requires deserialization of parameters - bool deserialize_parameters = false; - private: DISALLOW_COPY_AND_MOVE(CopyPlan); }; diff --git a/src/include/planner/csv_scan_plan.h b/src/include/planner/csv_scan_plan.h new file mode 100644 index 00000000000..a58cc87b0f5 --- /dev/null +++ b/src/include/planner/csv_scan_plan.h @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// csv_scan_plan.h +// +// Identification: src/include/planner/csv_scan_plan.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "planner/abstract_plan.h" + +namespace peloton { +namespace planner { + +class CSVScanPlan : public AbstractPlan { + public: + CSVScanPlan(const std::string file_name) : file_name_(std::move(file_name)) {} + + PlanNodeType GetPlanNodeType() const override { + return PlanNodeType::CSVSCAN; + } + + std::unique_ptr Copy() const override; + + private: + const std::string file_name_; +}; + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Implementation below +/// +//////////////////////////////////////////////////////////////////////////////// + +inline std::unique_ptr CSVScanPlan::Copy() const { + // TODO + return std::unique_ptr(); +} + +} // namespace planner +} // namespace peloton \ No newline at end of file diff --git a/src/optimizer/child_property_deriver.cpp b/src/optimizer/child_property_deriver.cpp index 1df06b3ea50..5020302b614 100644 --- a/src/optimizer/child_property_deriver.cpp +++ b/src/optimizer/child_property_deriver.cpp @@ -94,6 +94,12 @@ void ChildPropertyDeriver::Visit(const PhysicalIndexScan *op) { make_pair(provided_prop, vector>{})); } +void ChildPropertyDeriver::Visit(const ExternalFileScan *) { + // External file scans (like sequential scans) do not provide properties + output_.push_back( + make_pair(make_shared(), vector>{})); +} + void ChildPropertyDeriver::Visit(const QueryDerivedScan *) { output_.push_back( make_pair(requirements_, vector>{requirements_})); diff --git a/src/optimizer/cost_calculator.cpp b/src/optimizer/cost_calculator.cpp index 5dda9e67c8a..56cbbecc64e 100644 --- a/src/optimizer/cost_calculator.cpp +++ b/src/optimizer/cost_calculator.cpp @@ -59,6 +59,11 @@ void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalIndexScan *op) { memo_->GetGroupByID(gexpr_->GetGroupID())->GetNumRows() * DEFAULT_TUPLE_COST; } + +void CostCalculator::Visit(UNUSED_ATTRIBUTE const ExternalFileScan *) { + output_cost_ = 0.0; +} + void CostCalculator::Visit(UNUSED_ATTRIBUTE const QueryDerivedScan *op) { output_cost_ = 0.f; } diff --git a/src/optimizer/operators.cpp b/src/optimizer/operators.cpp index 78c34d16257..60e074556a3 100644 --- a/src/optimizer/operators.cpp +++ b/src/optimizer/operators.cpp @@ -61,6 +61,28 @@ bool LogicalGet::operator==(const BaseOperatorNode &r) { return get_id == node.get_id; } +//===--------------------------------------------------------------------===// +// External file get +//===--------------------------------------------------------------------===// + +Operator LogicalExternalFileGet::make(oid_t get_id) { + auto *get = new LogicalExternalFileGet(); + get->get_id = get_id; + return Operator(get); +} + +bool LogicalExternalFileGet::operator==(const BaseOperatorNode &node) { + if (node.GetType() != OpType::LogicalExternalFileGet) return false; + const auto &get = *static_cast(&node); + return get_id == get.get_id; +} + +hash_t LogicalExternalFileGet::Hash() const { + hash_t hash = BaseOperatorNode::Hash(); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&get_id)); + return hash; +} + //===--------------------------------------------------------------------===// // Query derived get //===--------------------------------------------------------------------===// @@ -411,6 +433,14 @@ Operator LogicalLimit::make(int64_t offset, int64_t limit) { return Operator(limit_op); } +//===--------------------------------------------------------------------===// +// External file output +//===--------------------------------------------------------------------===// +Operator LogicalExportExternalFile::make() { + auto *export_op = new LogicalExternalFileGet(); + return Operator(export_op); +} + //===--------------------------------------------------------------------===// // DummyScan //===--------------------------------------------------------------------===// @@ -506,6 +536,27 @@ hash_t PhysicalIndexScan::Hash() const { return hash; } +//===--------------------------------------------------------------------===// +// Physical external file scan +//===--------------------------------------------------------------------===// +Operator ExternalFileScan::make(oid_t get_id) { + auto *get = new ExternalFileScan(); + get->get_id = get_id; + return Operator(get); +} + +bool ExternalFileScan::operator==(const BaseOperatorNode &node) { + if (node.GetType() != OpType::QueryDerivedScan) return false; + const auto &get = *static_cast(&node); + return get_id == get.get_id; +} + +hash_t ExternalFileScan::Hash() const { + hash_t hash = BaseOperatorNode::Hash(); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&get_id)); + return hash; +} + //===--------------------------------------------------------------------===// // Query derived get //===--------------------------------------------------------------------===// @@ -846,6 +897,8 @@ std::string OperatorNode::name_ = "LeafOperator"; template <> std::string OperatorNode::name_ = "LogicalGet"; template <> +std::string OperatorNode::name_ = "LogicalExternalFileGet"; +template <> std::string OperatorNode::name_ = "LogicalQueryDerivedGet"; template <> @@ -884,12 +937,16 @@ std::string OperatorNode::name_ = "LogicalLimit"; template <> std::string OperatorNode::name_ = "LogicalDistinct"; template <> +std::string OperatorNode::name_ = "LogicalExportExternalFile"; +template <> std::string OperatorNode::name_ = "DummyScan"; template <> std::string OperatorNode::name_ = "PhysicalSeqScan"; template <> std::string OperatorNode::name_ = "PhysicalIndexScan"; template <> +std::string OperatorNode::name_ = "ExternalFileScan"; +template <> std::string OperatorNode::name_ = "QueryDerivedScan"; template <> std::string OperatorNode::name_ = "PhysicalOrderBy"; @@ -937,6 +994,8 @@ OpType OperatorNode::type_ = OpType::Leaf; template <> OpType OperatorNode::type_ = OpType::Get; template <> +OpType OperatorNode::type_ = OpType::LogicalExternalFileGet; +template <> OpType OperatorNode::type_ = OpType::LogicalQueryDerivedGet; template <> @@ -974,6 +1033,9 @@ template <> OpType OperatorNode::type_ = OpType::LogicalDistinct; template <> OpType OperatorNode::type_ = OpType::LogicalLimit; +template <> +OpType OperatorNode::type_ = OpType::LogicalExportExternalFile; + template <> OpType OperatorNode::type_ = OpType::DummyScan; template <> @@ -981,6 +1043,8 @@ OpType OperatorNode::type_ = OpType::SeqScan; template <> OpType OperatorNode::type_ = OpType::IndexScan; template <> +OpType OperatorNode::type_ = OpType::ExternalFileScan; +template <> OpType OperatorNode::type_ = OpType::QueryDerivedScan; template <> OpType OperatorNode::type_ = OpType::OrderBy; diff --git a/src/optimizer/optimizer.cpp b/src/optimizer/optimizer.cpp index 62f813ec876..5722034ef7d 100644 --- a/src/optimizer/optimizer.cpp +++ b/src/optimizer/optimizer.cpp @@ -214,13 +214,6 @@ unique_ptr Optimizer::HandleDDLStatement( ddl_plan = move(analyze_plan); break; } - case StatementType::COPY: { - LOG_TRACE("Adding Copy plan..."); - parser::CopyStatement *copy_parse_tree = - static_cast(tree); - ddl_plan = util::CreateCopyPlan(copy_parse_tree); - break; - } default: is_ddl_stmt = false; } diff --git a/src/optimizer/plan_generator.cpp b/src/optimizer/plan_generator.cpp index a16b70c3878..b592e92fe4c 100644 --- a/src/optimizer/plan_generator.cpp +++ b/src/optimizer/plan_generator.cpp @@ -20,6 +20,7 @@ #include "optimizer/operator_expression.h" #include "optimizer/properties.h" #include "planner/aggregate_plan.h" +#include "planner/csv_scan_plan.h" #include "planner/delete_plan.h" #include "planner/hash_join_plan.h" #include "planner/hash_plan.h" @@ -127,6 +128,10 @@ void PlanGenerator::Visit(const PhysicalIndexScan *op) { predicate.release(), column_ids, index_scan_desc, false)); } +void PlanGenerator::Visit(const ExternalFileScan *) { + output_plan_.reset(new planner::CSVScanPlan("sdfsdf")); +} + void PlanGenerator::Visit(const QueryDerivedScan *) { PELOTON_ASSERT(children_plans_.size() == 1); output_plan_ = move(children_plans_[0]); diff --git a/src/optimizer/query_to_operator_transformer.cpp b/src/optimizer/query_to_operator_transformer.cpp index ff75140d5f5..b0f9b8f4446 100644 --- a/src/optimizer/query_to_operator_transformer.cpp +++ b/src/optimizer/query_to_operator_transformer.cpp @@ -359,8 +359,34 @@ void QueryToOperatorTransformer::Visit(parser::UpdateStatement *op) { output_expr_ = update_expr; } -void QueryToOperatorTransformer::Visit( - UNUSED_ATTRIBUTE parser::CopyStatement *op) {} +void QueryToOperatorTransformer::Visit(parser::CopyStatement *op) { + if (op->is_from) { + auto get_op = std::make_shared( + LogicalExternalFileGet::make(GetAndIncreaseGetId())); + + auto target_table = + catalog::Catalog::GetInstance() + ->GetDatabaseObject(op->table->GetDatabaseName(), txn_) + ->GetTableObject(op->table->GetTableName()); + + auto insert_expr = std::make_shared( + LogicalInsertSelect::make(target_table)); + + insert_expr->PushChild(get_op); + output_expr_ = insert_expr; + } else { + if (op->select_stmt != nullptr) { + op->select_stmt->Accept(this); + } else { + op->table->Accept(this); + } + auto export_op = + std::make_shared(LogicalExportExternalFile::make()); + export_op->PushChild(output_expr_); + output_expr_ = export_op; + } +} + void QueryToOperatorTransformer::Visit( UNUSED_ATTRIBUTE parser::AnalyzeStatement *op) {} diff --git a/src/optimizer/rule.cpp b/src/optimizer/rule.cpp index 1e81799147d..fc4bc837736 100644 --- a/src/optimizer/rule.cpp +++ b/src/optimizer/rule.cpp @@ -39,6 +39,7 @@ RuleSet::RuleSet() { AddImplementationRule(new GetToDummyScan()); AddImplementationRule(new GetToSeqScan()); AddImplementationRule(new GetToIndexScan()); + AddImplementationRule(new LogicalExternalFileGetToPhysical()); AddImplementationRule(new LogicalQueryDerivedGetToPhysical()); AddImplementationRule(new InnerJoinToInnerNLJoin()); AddImplementationRule(new InnerJoinToInnerHashJoin()); diff --git a/src/optimizer/rule_impls.cpp b/src/optimizer/rule_impls.cpp index e540555c9e3..fcab5412621 100644 --- a/src/optimizer/rule_impls.cpp +++ b/src/optimizer/rule_impls.cpp @@ -440,6 +440,31 @@ void LogicalQueryDerivedGetToPhysical::Transform( transformed.push_back(result_plan); } +/////////////////////////////////////////////////////////////////////////////// +/// LogicalExternalFileGetToPhysical +LogicalExternalFileGetToPhysical::LogicalExternalFileGetToPhysical() { + type_ = RuleType::EXTERNAL_FILE_GET_TO_PHYSICAL; + match_pattern = std::make_shared(OpType::LogicalExternalFileGet); +} + +bool LogicalExternalFileGetToPhysical::Check( + UNUSED_ATTRIBUTE std::shared_ptr plan, + UNUSED_ATTRIBUTE OptimizeContext *context) const { + return true; +} + +void LogicalExternalFileGetToPhysical::Transform( + std::shared_ptr input, + std::vector> &transformed, + UNUSED_ATTRIBUTE OptimizeContext *context) const { + const auto *get = input->Op().As(); + + auto result_plan = + std::make_shared(ExternalFileScan::make(get->get_id)); + PELOTON_ASSERT(input->Children().empty()); + transformed.push_back(result_plan); +} + /////////////////////////////////////////////////////////////////////////////// /// LogicalDeleteToPhysical LogicalDeleteToPhysical::LogicalDeleteToPhysical() { diff --git a/src/optimizer/util.cpp b/src/optimizer/util.cpp index b0129484442..4ff60ee36c8 100644 --- a/src/optimizer/util.cpp +++ b/src/optimizer/util.cpp @@ -142,39 +142,6 @@ bool ContainsJoinColumns(const std::unordered_set &l_group_alias, return false; } -std::unique_ptr CreateCopyPlan( - parser::CopyStatement *copy_stmt) { - std::string table_name(copy_stmt->table->GetTableName()); - bool deserialize_parameters = false; - - // If we're copying the query metric table, then we need to handle the - // deserialization of prepared stmt parameters - if (table_name == QUERY_METRICS_CATALOG_NAME) { - LOG_DEBUG("Copying the query_metric table."); - deserialize_parameters = true; - } - - std::unique_ptr copy_plan( - new planner::CopyPlan(copy_stmt->file_path, deserialize_parameters)); - - auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); - auto txn = txn_manager.BeginTransaction(); - auto target_table = catalog::Catalog::GetInstance()->GetTableWithName( - copy_stmt->table->GetDatabaseName(), - copy_stmt->table->GetSchemaName(), - copy_stmt->table->GetTableName(), txn); - txn_manager.CommitTransaction(txn); - - std::unique_ptr select_plan( - new planner::SeqScanPlan(target_table, nullptr, {}, false)); - - LOG_DEBUG("Sequential scan plan for copy created"); - - // Attach it to the copy plan - copy_plan->AddChild(std::move(select_plan)); - return copy_plan; -} - std::unordered_map> ConstructSelectElementMap( std::vector> &select_list) { From c749fe288adbcef136c7c07d06ccf7c985c42ab5 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Thu, 26 Apr 2018 02:19:13 -0400 Subject: [PATCH 04/42] Propagate external file information --- src/executor/plan_executor.cpp | 12 ++--- src/include/optimizer/operators.h | 34 +++++++++---- src/include/planner/csv_scan_plan.h | 2 +- src/optimizer/operators.cpp | 48 +++++++++++++------ src/optimizer/plan_generator.cpp | 9 +++- .../query_to_operator_transformer.cpp | 13 ++--- src/optimizer/rule_impls.cpp | 21 ++++---- 7 files changed, 88 insertions(+), 51 deletions(-) diff --git a/src/executor/plan_executor.cpp b/src/executor/plan_executor.cpp index feca24cec2f..a01330b7b6d 100644 --- a/src/executor/plan_executor.cpp +++ b/src/executor/plan_executor.cpp @@ -170,9 +170,9 @@ void PlanExecutor::ExecutePlan( } catch (Exception &e) { ExecutionResult result; result.m_result = ResultType::FAILURE; - result.m_error_message = e.what(); - LOG_ERROR("Error thrown during execution: %s", - result.m_error_message.c_str()); + result.m_error_message = + StringUtil::Format("ERROR: during execution ['%s']", e.what()); + LOG_ERROR("Error during execution: %s", e.what()); on_complete(result, {}); } } @@ -349,9 +349,9 @@ executor::AbstractExecutor *BuildExecutorTree( break; default: - LOG_ERROR("Unsupported plan node type : %s", - PlanNodeTypeToString(plan_node_type).c_str()); - break; + throw NotImplementedException{ + StringUtil::Format("Unsupported plan node type : %s", + PlanNodeTypeToString(plan_node_type).c_str())}; } LOG_TRACE("Adding %s Executor", PlanNodeTypeToString(plan_node_type).c_str()); diff --git a/src/include/optimizer/operators.h b/src/include/optimizer/operators.h index 7e27240973a..8ec891c8131 100644 --- a/src/include/optimizer/operators.h +++ b/src/include/optimizer/operators.h @@ -72,7 +72,8 @@ class LogicalGet : public OperatorNode { //===--------------------------------------------------------------------===// class LogicalExternalFileGet : public OperatorNode { public: - static Operator make(oid_t get_id); + static Operator make(oid_t get_id, ExternalFileFormat format, + std::string file_name); bool operator==(const BaseOperatorNode &r) override; @@ -80,6 +81,8 @@ class LogicalExternalFileGet : public OperatorNode { // identifier for all get operators oid_t get_id; + ExternalFileFormat format; + std::string file_name; }; //===--------------------------------------------------------------------===// @@ -260,7 +263,8 @@ class LogicalAggregateAndGroupBy class LogicalInsert : public OperatorNode { public: static Operator make( - std::shared_ptr target_table, const std::vector *columns, + std::shared_ptr target_table, + const std::vector *columns, const std::vector>> *values); @@ -272,7 +276,8 @@ class LogicalInsert : public OperatorNode { class LogicalInsertSelect : public OperatorNode { public: - static Operator make(std::shared_ptr target_table); + static Operator make( + std::shared_ptr target_table); std::shared_ptr target_table; }; @@ -300,7 +305,8 @@ class LogicalLimit : public OperatorNode { //===--------------------------------------------------------------------===// class LogicalDelete : public OperatorNode { public: - static Operator make(std::shared_ptr target_table); + static Operator make( + std::shared_ptr target_table); std::shared_ptr target_table; }; @@ -340,7 +346,8 @@ class DummyScan : public OperatorNode { //===--------------------------------------------------------------------===// class PhysicalSeqScan : public OperatorNode { public: - static Operator make(oid_t get_id, std::shared_ptr table, + static Operator make(oid_t get_id, + std::shared_ptr table, std::string alias, std::vector predicates, bool update); @@ -362,7 +369,8 @@ class PhysicalSeqScan : public OperatorNode { //===--------------------------------------------------------------------===// class PhysicalIndexScan : public OperatorNode { public: - static Operator make(oid_t get_id, std::shared_ptr table, + static Operator make(oid_t get_id, + std::shared_ptr table, std::string alias, std::vector predicates, bool update, oid_t index_id, std::vector key_column_id_list, @@ -394,7 +402,8 @@ class PhysicalIndexScan : public OperatorNode { //===--------------------------------------------------------------------===// class ExternalFileScan : public OperatorNode { public: - static Operator make(oid_t get_id); + static Operator make(oid_t get_id, ExternalFileFormat format, + std::string file_name); bool operator==(const BaseOperatorNode &r) override; @@ -402,6 +411,8 @@ class ExternalFileScan : public OperatorNode { // identifier for all get operators oid_t get_id; + ExternalFileFormat format; + std::string file_name; }; //===--------------------------------------------------------------------===// @@ -551,7 +562,8 @@ class PhysicalOuterHashJoin : public OperatorNode { class PhysicalInsert : public OperatorNode { public: static Operator make( - std::shared_ptr target_table, const std::vector *columns, + std::shared_ptr target_table, + const std::vector *columns, const std::vector>> *values); @@ -563,7 +575,8 @@ class PhysicalInsert : public OperatorNode { class PhysicalInsertSelect : public OperatorNode { public: - static Operator make(std::shared_ptr target_table); + static Operator make( + std::shared_ptr target_table); std::shared_ptr target_table; }; @@ -573,7 +586,8 @@ class PhysicalInsertSelect : public OperatorNode { //===--------------------------------------------------------------------===// class PhysicalDelete : public OperatorNode { public: - static Operator make(std::shared_ptr target_table); + static Operator make( + std::shared_ptr target_table); std::shared_ptr target_table; }; diff --git a/src/include/planner/csv_scan_plan.h b/src/include/planner/csv_scan_plan.h index a58cc87b0f5..e871b15ac02 100644 --- a/src/include/planner/csv_scan_plan.h +++ b/src/include/planner/csv_scan_plan.h @@ -19,7 +19,7 @@ namespace planner { class CSVScanPlan : public AbstractPlan { public: - CSVScanPlan(const std::string file_name) : file_name_(std::move(file_name)) {} + CSVScanPlan(std::string file_name) : file_name_(std::move(file_name)) {} PlanNodeType GetPlanNodeType() const override { return PlanNodeType::CSVSCAN; diff --git a/src/optimizer/operators.cpp b/src/optimizer/operators.cpp index 60e074556a3..c9fb133bc90 100644 --- a/src/optimizer/operators.cpp +++ b/src/optimizer/operators.cpp @@ -11,11 +11,13 @@ //===----------------------------------------------------------------------===// #include "optimizer/operators.h" + #include "optimizer/operator_visitor.h" #include "expression/expression_util.h" namespace peloton { namespace optimizer { + //===--------------------------------------------------------------------===// // Leaf //===--------------------------------------------------------------------===// @@ -51,7 +53,7 @@ hash_t LogicalGet::Hash() const { } bool LogicalGet::operator==(const BaseOperatorNode &r) { - if (r.GetType()!= OpType::Get) return false; + if (r.GetType() != OpType::Get) return false; const LogicalGet &node = *static_cast(&r); if (predicates.size() != node.predicates.size()) return false; for (size_t i = 0; i < predicates.size(); i++) { @@ -65,21 +67,28 @@ bool LogicalGet::operator==(const BaseOperatorNode &r) { // External file get //===--------------------------------------------------------------------===// -Operator LogicalExternalFileGet::make(oid_t get_id) { +Operator LogicalExternalFileGet::make(oid_t get_id, ExternalFileFormat format, + std::string file_name) { auto *get = new LogicalExternalFileGet(); get->get_id = get_id; + get->format = format; + get->file_name = std::move(file_name); return Operator(get); } bool LogicalExternalFileGet::operator==(const BaseOperatorNode &node) { if (node.GetType() != OpType::LogicalExternalFileGet) return false; - const auto &get = *static_cast(&node); - return get_id == get.get_id; + const auto &get = *static_cast(&node); + return (get_id == get.get_id && format == get.format && + file_name == get.file_name); } hash_t LogicalExternalFileGet::Hash() const { hash_t hash = BaseOperatorNode::Hash(); hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&get_id)); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&format)); + hash = HashUtil::CombineHashes( + hash, HashUtil::HashBytes(file_name.data(), file_name.length())); return hash; } @@ -407,8 +416,8 @@ Operator LogicalDelete::make( //===--------------------------------------------------------------------===// Operator LogicalUpdate::make( std::shared_ptr target_table, - const std::vector> - *updates) { + const std::vector> * + updates) { LogicalUpdate *update_op = new LogicalUpdate; update_op->target_table = target_table; update_op->updates = updates; @@ -539,21 +548,28 @@ hash_t PhysicalIndexScan::Hash() const { //===--------------------------------------------------------------------===// // Physical external file scan //===--------------------------------------------------------------------===// -Operator ExternalFileScan::make(oid_t get_id) { +Operator ExternalFileScan::make(oid_t get_id, ExternalFileFormat format, + std::string file_name) { auto *get = new ExternalFileScan(); get->get_id = get_id; + get->format = format; + get->file_name = file_name; return Operator(get); } bool ExternalFileScan::operator==(const BaseOperatorNode &node) { if (node.GetType() != OpType::QueryDerivedScan) return false; const auto &get = *static_cast(&node); - return get_id == get.get_id; + return (get_id == get.get_id && format == get.format && + file_name == get.file_name); } hash_t ExternalFileScan::Hash() const { hash_t hash = BaseOperatorNode::Hash(); hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&get_id)); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&format)); + hash = HashUtil::CombineHashes( + hash, HashUtil::HashBytes(file_name.data(), file_name.length())); return hash; } @@ -799,8 +815,8 @@ Operator PhysicalDelete::make( //===--------------------------------------------------------------------===// Operator PhysicalUpdate::make( std::shared_ptr target_table, - const std::vector> - *updates) { + const std::vector> * + updates) { PhysicalUpdate *update = new PhysicalUpdate; update->target_table = target_table; update->updates = updates; @@ -897,7 +913,8 @@ std::string OperatorNode::name_ = "LeafOperator"; template <> std::string OperatorNode::name_ = "LogicalGet"; template <> -std::string OperatorNode::name_ = "LogicalExternalFileGet"; +std::string OperatorNode::name_ = + "LogicalExternalFileGet"; template <> std::string OperatorNode::name_ = "LogicalQueryDerivedGet"; @@ -937,7 +954,8 @@ std::string OperatorNode::name_ = "LogicalLimit"; template <> std::string OperatorNode::name_ = "LogicalDistinct"; template <> -std::string OperatorNode::name_ = "LogicalExportExternalFile"; +std::string OperatorNode::name_ = + "LogicalExportExternalFile"; template <> std::string OperatorNode::name_ = "DummyScan"; template <> @@ -994,7 +1012,8 @@ OpType OperatorNode::type_ = OpType::Leaf; template <> OpType OperatorNode::type_ = OpType::Get; template <> -OpType OperatorNode::type_ = OpType::LogicalExternalFileGet; +OpType OperatorNode::type_ = + OpType::LogicalExternalFileGet; template <> OpType OperatorNode::type_ = OpType::LogicalQueryDerivedGet; @@ -1034,7 +1053,8 @@ OpType OperatorNode::type_ = OpType::LogicalDistinct; template <> OpType OperatorNode::type_ = OpType::LogicalLimit; template <> -OpType OperatorNode::type_ = OpType::LogicalExportExternalFile; +OpType OperatorNode::type_ = + OpType::LogicalExportExternalFile; template <> OpType OperatorNode::type_ = OpType::DummyScan; diff --git a/src/optimizer/plan_generator.cpp b/src/optimizer/plan_generator.cpp index b592e92fe4c..6960e4ddd25 100644 --- a/src/optimizer/plan_generator.cpp +++ b/src/optimizer/plan_generator.cpp @@ -128,8 +128,13 @@ void PlanGenerator::Visit(const PhysicalIndexScan *op) { predicate.release(), column_ids, index_scan_desc, false)); } -void PlanGenerator::Visit(const ExternalFileScan *) { - output_plan_.reset(new planner::CSVScanPlan("sdfsdf")); +void PlanGenerator::Visit(const ExternalFileScan *op) { + switch (op->format) { + case ExternalFileFormat::CSV: { + output_plan_.reset(new planner::CSVScanPlan(op->file_name)); + break; + } + } } void PlanGenerator::Visit(const QueryDerivedScan *) { diff --git a/src/optimizer/query_to_operator_transformer.cpp b/src/optimizer/query_to_operator_transformer.cpp index b0f9b8f4446..f5f05d6c6aa 100644 --- a/src/optimizer/query_to_operator_transformer.cpp +++ b/src/optimizer/query_to_operator_transformer.cpp @@ -259,10 +259,10 @@ void QueryToOperatorTransformer::Visit(parser::InsertStatement *op) { if (column_objects[i]->IsNotNull()) { // TODO: Add check for default value's existence for the current // column - throw CatalogException( - StringUtil::Format("ERROR: null value in column \"%s\" " - "violates not-null constraint", - column_objects[i]->GetColumnName().c_str())); + throw CatalogException(StringUtil::Format( + "ERROR: null value in column \"%s\" " + "violates not-null constraint", + column_objects[i]->GetColumnName().c_str())); } } } @@ -361,8 +361,9 @@ void QueryToOperatorTransformer::Visit(parser::UpdateStatement *op) { } void QueryToOperatorTransformer::Visit(parser::CopyStatement *op) { if (op->is_from) { - auto get_op = std::make_shared( - LogicalExternalFileGet::make(GetAndIncreaseGetId())); + auto get_op = + std::make_shared(LogicalExternalFileGet::make( + GetAndIncreaseGetId(), op->format, op->file_path)); auto target_table = catalog::Catalog::GetInstance() diff --git a/src/optimizer/rule_impls.cpp b/src/optimizer/rule_impls.cpp index fcab5412621..284109a38f1 100644 --- a/src/optimizer/rule_impls.cpp +++ b/src/optimizer/rule_impls.cpp @@ -275,9 +275,8 @@ void GetToIndexScan::Transform( sort_by_asc_base_column = false; break; } - auto bound_oids = - reinterpret_cast(expr) - ->GetBoundOid(); + auto bound_oids = reinterpret_cast( + expr)->GetBoundOid(); sort_col_ids.push_back(std::get<2>(bound_oids)); } // Check whether any index can fulfill sort property @@ -358,20 +357,16 @@ void GetToIndexScan::Transform( if (value_expr->GetExpressionType() == ExpressionType::VALUE_CONSTANT) { value_list.push_back( reinterpret_cast( - value_expr) - ->GetValue()); + value_expr)->GetValue()); LOG_TRACE("Value Type: %d", static_cast( reinterpret_cast( - expr->GetModifiableChild(1)) - ->GetValueType())); + expr->GetModifiableChild(1))->GetValueType())); } else { value_list.push_back( type::ValueFactory::GetParameterOffsetValue( reinterpret_cast( - value_expr) - ->GetValueIdx()) - .Copy()); + value_expr)->GetValueIdx()).Copy()); LOG_TRACE("Parameter offset: %s", (*value_list.rbegin()).GetInfo().c_str()); } @@ -459,9 +454,11 @@ void LogicalExternalFileGetToPhysical::Transform( UNUSED_ATTRIBUTE OptimizeContext *context) const { const auto *get = input->Op().As(); - auto result_plan = - std::make_shared(ExternalFileScan::make(get->get_id)); + auto result_plan = std::make_shared( + ExternalFileScan::make(get->get_id, get->format, get->file_name)); + PELOTON_ASSERT(input->Children().empty()); + transformed.push_back(result_plan); } From 73d583ff2ad91e661d8a6761fd430fcbc52bf873 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Fri, 27 Apr 2018 09:28:30 -0400 Subject: [PATCH 05/42] Removed unused serialization stuff from plan nodes --- src/include/planner/abstract_plan.h | 27 --- src/include/planner/abstract_scan_plan.h | 2 - src/include/planner/seq_scan_plan.h | 19 +- src/planner/abstract_plan.cpp | 3 - src/planner/seq_scan_plan.cpp | 227 +---------------------- 5 files changed, 11 insertions(+), 267 deletions(-) diff --git a/src/include/planner/abstract_plan.h b/src/include/planner/abstract_plan.h index c257b20d830..bb1428f81d4 100644 --- a/src/include/planner/abstract_plan.h +++ b/src/include/planner/abstract_plan.h @@ -20,8 +20,6 @@ #include "codegen/query_parameters_map.h" #include "common/printable.h" #include "planner/binding_context.h" -#include "type/serializeio.h" -#include "type/serializer.h" #include "common/internal_types.h" #include "type/value.h" #include "util/hash_util.h" @@ -66,8 +64,6 @@ class AbstractPlan : public Printable { const AbstractPlan *GetChild(uint32_t child_index) const; - const AbstractPlan *GetParent() const; - //===--------------------------------------------------------------------===// // Accessors //===--------------------------------------------------------------------===// @@ -111,23 +107,6 @@ class AbstractPlan : public Printable { virtual std::unique_ptr Copy() const = 0; - // A plan will be sent to anther node via serialization - // So serialization should be implemented by the derived classes - - //===--------------------------------------------------------------------===// - // Serialization/Deserialization - // Each sub-class will have to implement these functions - // After the implementation for each sub-class, we should set these to pure - // virtual - //===--------------------------------------------------------------------===// - virtual bool SerializeTo(SerializeOutput &output UNUSED_ATTRIBUTE) const { - return false; - } - virtual bool DeserializeFrom(SerializeInput &input UNUSED_ATTRIBUTE) { - return false; - } - virtual int SerializeSize() const { return 0; } - virtual hash_t Hash() const; virtual bool operator==(const AbstractPlan &rhs) const; @@ -143,16 +122,10 @@ class AbstractPlan : public Printable { } } - protected: - // only used by its derived classes (when deserialization) - AbstractPlan *Parent() const { return parent_; } - private: // A plan node can have multiple children std::vector> children_; - AbstractPlan *parent_ = nullptr; - // TODO: This field is harded coded now. This needs to be changed when // optimizer has the cost model and cardinality estimation int estimated_cardinality_ = 500000; diff --git a/src/include/planner/abstract_scan_plan.h b/src/include/planner/abstract_scan_plan.h index 816676736b5..099bf5a161b 100644 --- a/src/include/planner/abstract_scan_plan.h +++ b/src/include/planner/abstract_scan_plan.h @@ -71,8 +71,6 @@ class AbstractScan : public AbstractPlan { protected: void SetTargetTable(storage::DataTable *table) { target_table_ = table; } - void AddColumnId(oid_t col_id) { column_ids_.push_back(col_id); } - void SetPredicate(expression::AbstractExpression *predicate) { predicate_ = std::unique_ptr(predicate); } diff --git a/src/include/planner/seq_scan_plan.h b/src/include/planner/seq_scan_plan.h index 9f0f411f2cb..fed2f12d783 100644 --- a/src/include/planner/seq_scan_plan.h +++ b/src/include/planner/seq_scan_plan.h @@ -18,10 +18,20 @@ #include "common/internal_types.h" #include "common/logger.h" +#include "expression/abstract_expression.h" #include "planner/abstract_scan_plan.h" #include "type/serializer.h" namespace peloton { + +namespace expression { +class Parameter; +} // namespace expression + +namespace storage { +class DataTable; +} // namespace storage + namespace planner { class SeqScanPlan : public AbstractScan { @@ -48,15 +58,6 @@ class SeqScanPlan : public AbstractScan { void SetParameterValues(std::vector *values) override; - //===--------------------------------------------------------------------===// - // Serialization/Deserialization - //===--------------------------------------------------------------------===// - bool SerializeTo(SerializeOutput &output) const override; - bool DeserializeFrom(SerializeInput &input) override; - - /* For init SerializeOutput */ - int SerializeSize() const override; - std::unique_ptr Copy() const override { auto *new_plan = new SeqScanPlan(GetTable(), GetPredicate()->Copy(), GetColumnIds()); diff --git a/src/planner/abstract_plan.cpp b/src/planner/abstract_plan.cpp index 241323bb0e9..49014a6f471 100644 --- a/src/planner/abstract_plan.cpp +++ b/src/planner/abstract_plan.cpp @@ -14,7 +14,6 @@ #include "common/logger.h" #include "common/macros.h" -#include "expression/expression_util.h" #include "util/hash_util.h" namespace peloton { @@ -38,8 +37,6 @@ const AbstractPlan *AbstractPlan::GetChild(uint32_t child_index) const { return children_[child_index].get(); } -const AbstractPlan *AbstractPlan::GetParent() const { return parent_; } - // Get a string representation of this plan std::ostream &operator<<(std::ostream &os, const AbstractPlan &plan) { os << PlanNodeTypeToString(plan.GetPlanNodeType()); diff --git a/src/planner/seq_scan_plan.cpp b/src/planner/seq_scan_plan.cpp index 62e8299aae7..7c3ba3d8a14 100644 --- a/src/planner/seq_scan_plan.cpp +++ b/src/planner/seq_scan_plan.cpp @@ -6,246 +6,21 @@ // // Identification: src/planner/seq_scan_plan.cpp // -// Copyright (c) 2015-17, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// #include "planner/seq_scan_plan.h" -#include "parser/select_statement.h" -#include "catalog/manager.h" -#include "catalog/schema.h" #include "common/logger.h" #include "common/macros.h" #include "expression/abstract_expression.h" -#include "expression/expression_util.h" #include "storage/data_table.h" -#include "storage/storage_manager.h" #include "common/internal_types.h" namespace peloton { namespace planner { -//===--------------------------------------------------------------------===// -// Serialization/Deserialization -//===--------------------------------------------------------------------===// - -/** - * The SeqScanPlan has the following members: - * database_id, table_id, predicate, column_id, parent(might be NULL) - * TODO: SeqScanPlan doesn't have children, so we don't need to handle it - * - * Therefore a SeqScanPlan is serialized as: - * [(int) total size] - * [(int8_t) plan type] - * [(int) database_id] - * [(int) table_id] - * [(int) num column_id] - * [(int) column id...] - * [(int8_t) expr type] : if invalid, predicate is null - * [(bytes) predicate] : predicate is Expression - * [(int8_t) plan type] : if invalid, parent is null - * [(bytes) parent] : parent is also a plan - * - * TODO: parent_ seems never be set or used - */ - -bool SeqScanPlan::SerializeTo(SerializeOutput &output) const { - // A placeholder for the total size written at the end - int start = output.Position(); - output.WriteInt(-1); - - // Write the SeqScanPlan type - PlanNodeType plan_type = GetPlanNodeType(); - output.WriteByte(static_cast(plan_type)); - - // Write database id and table id - if (!GetTable()) { - // The plan is not completed - return false; - } - oid_t database_id = GetTable()->GetDatabaseOid(); - oid_t table_id = GetTable()->GetOid(); - - output.WriteInt(static_cast(database_id)); - output.WriteInt(static_cast(table_id)); - - // If column has 0 item, just write the columnid_count with 0 - int columnid_count = GetColumnIds().size(); - output.WriteInt(columnid_count); - - // If column has 0 item, nothing happens here - for (int it = 0; it < columnid_count; it++) { - oid_t col_id = GetColumnIds()[it]; - output.WriteInt(static_cast(col_id)); - } - - // Write predicate - if (GetPredicate() == nullptr) { - // Write the type - output.WriteByte(static_cast(ExpressionType::INVALID)); - } else { - // Write the expression type - ExpressionType expr_type = GetPredicate()->GetExpressionType(); - output.WriteByte(static_cast(expr_type)); - } - - // Write parent, but parent seems never be set or used right now - if (GetParent() == nullptr) { - // Write the type - output.WriteByte(static_cast(PlanNodeType::INVALID)); - } else { - // Write the parent type - PlanNodeType parent_type = GetParent()->GetPlanNodeType(); - output.WriteByte(static_cast(parent_type)); - - // Write parent - GetParent()->SerializeTo(output); - } - - // Write the total length - int32_t sz = static_cast(output.Position() - start - sizeof(int)); - PELOTON_ASSERT(sz > 0); - output.WriteIntAt(start, sz); - - return true; -} - -/** - * Therefore a SeqScanPlan is serialized as: - * [(int) total size] - * [(int8_t) plan type] - * [(int) database_id] - * [(int) table_id] - * [(int) num column_id] - * [(int) column id...] - * [(int8_t) expr type] : if invalid, predicate is null - * [(bytes) predicate] : predicate is Expression - * [(int8_t) plan type] : if invalid, parent is null - * [(bytes) parent] : parent is also a plan - */ -bool SeqScanPlan::DeserializeFrom(SerializeInput &input) { - // Read the size of SeqScanPlan class - input.ReadInt(); - - // Read the type - UNUSED_ATTRIBUTE PlanNodeType plan_type = - (PlanNodeType)input.ReadEnumInSingleByte(); - PELOTON_ASSERT(plan_type == GetPlanNodeType()); - - // Read database id - oid_t database_oid = input.ReadInt(); - - // Read table id - oid_t table_oid = input.ReadInt(); - - // Get table and set it to the member - storage::DataTable *target_table = nullptr; - try{ - target_table = static_cast( - storage::StorageManager::GetInstance()->GetTableWithOid( - database_oid, table_oid)); - } catch (CatalogException &e) { - LOG_TRACE("Can't find table %d! Return false", table_oid); - return false; - } - SetTargetTable(target_table); - - // Read the number of column_id and set them to column_ids_ - oid_t columnid_count = input.ReadInt(); - for (oid_t it = 0; it < columnid_count; it++) { - oid_t column_id = input.ReadInt(); - AddColumnId(column_id); - } - - // Read the type - ExpressionType expr_type = (ExpressionType)input.ReadEnumInSingleByte(); - - // Predicate deserialization - if (expr_type != ExpressionType::INVALID) { - switch (expr_type) { - // case ExpressionType::COMPARE_IN: - // predicate_ = - // std::unique_ptr(new - // ComparisonExpression (101)); - // predicate_.DeserializeFrom(input); - // break; - - default: { - LOG_ERROR( - "Expression deserialization :: Unsupported EXPRESSION_TYPE: %s", - ExpressionTypeToString(expr_type).c_str()); - break; - } - } - } - - // Read the type of parent - PlanNodeType parent_type = (PlanNodeType)input.ReadEnumInSingleByte(); - - // Parent deserialization - if (parent_type != PlanNodeType::INVALID) { - switch (expr_type) { - // case ExpressionType::COMPARE_IN: - // predicate_ = - // std::unique_ptr(new - // ComparisonExpression (101)); - // predicate_.DeserializeFrom(input); - // break; - - default: { - LOG_ERROR("Parent deserialization :: Unsupported PlanNodeType: %s", - ExpressionTypeToString(expr_type).c_str()); - break; - } - } - } - - return true; -} -/** - * - * SeqScanPlan is serialized as: - * [(int) total size] - * [(int8_t) plan type] - * [(int) database_id] - * [(int) table_id] - * [(int) num column_id] - * [(int) column id...] - * [(int8_t) expr type] : if invalid, predicate is null - * [(bytes) predicate] : predicate is Expression - * [(int8_t) plan type] : if invalid, parent is null - * [(bytes) parent] : parent is also a plan - * - * So, the fixed size part is: - * [(int) total size] 4 + - * [(int8_t) plan type] 1 + - * [(int) database_id] 4 + - * [(int) table_id] 4 + - * [(int) num column_id]4 + - * [(int8_t) expr type] 1 + - * [(int8_t) plan type] 1 = - * the variant part is : - * [(int) column id...]: num column_id * 4 - * [(bytes) predicate] : predicate->GetSerializeSize() - * [(bytes) parent] : parent->GetSerializeSize() - */ -int SeqScanPlan::SerializeSize() const { - // Fixed size. see the detail above - int size_fix = sizeof(int) * 4 + 3; - int size_column_ids = GetColumnIds().size() * sizeof(int); - int size = size_fix + size_column_ids; - - if (GetPredicate() != nullptr) { - size = size + GetPredicate()->SerializeSize(); - } - if (Parent()) { - size = size + Parent()->SerializeSize(); - } - - return size; -} - void SeqScanPlan::SetParameterValues(std::vector *values) { LOG_TRACE("Setting parameter values in Sequential Scan"); From 7e61425e321bc744e6a536f0ed4d54198342fb02 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Mon, 30 Apr 2018 00:36:17 -0400 Subject: [PATCH 06/42] Codegen can now have constant generic/opaque bytes in module --- src/codegen/codegen.cpp | 38 +++++++++++++++++++------------ src/codegen/type/boolean_type.cpp | 4 ++-- src/include/codegen/codegen.h | 8 ++++--- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index 251a4edd8bf..b6449ae4138 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -59,12 +59,30 @@ llvm::Constant *CodeGen::ConstDouble(double val) const { return llvm::ConstantFP::get(DoubleType(), val); } -llvm::Constant *CodeGen::ConstString(const std::string &s) const { +llvm::Value *CodeGen::ConstString(const std::string &str_val, + const std::string &name) const { // Strings are treated as arrays of bytes - auto *str = llvm::ConstantDataArray::getString(GetContext(), s); - return new llvm::GlobalVariable(GetModule(), str->getType(), true, - llvm::GlobalValue::InternalLinkage, str, - "str"); + auto *str = llvm::ConstantDataArray::getString(GetContext(), str_val); + auto *global_var = + new llvm::GlobalVariable(GetModule(), str->getType(), true, + llvm::GlobalValue::InternalLinkage, str, name); + return GetBuilder().CreateInBoundsGEP(global_var, {Const32(0), Const32(0)}); +} + +llvm::Value *CodeGen::ConstGenericBytes(llvm::Type *type, const void *data, + uint32_t length, + const std::string &name) const { + // Create the constant data array that wraps the input data + llvm::ArrayRef elements{reinterpret_cast(data), + length}; + auto *arr = llvm::ConstantDataArray::get(GetContext(), elements); + + // Create a global variable for the data + auto *global_var = new llvm::GlobalVariable( + GetModule(), type, true, llvm::GlobalValue::InternalLinkage, arr, name); + + // Return a pointer to the first element + return GetBuilder().CreateInBoundsGEP(global_var, {Const32(0), Const32(0)}); } llvm::Constant *CodeGen::Null(llvm::Type *type) const { @@ -75,11 +93,6 @@ llvm::Constant *CodeGen::NullPtr(llvm::PointerType *type) const { return llvm::ConstantPointerNull::get(type); } -llvm::Value *CodeGen::ConstStringPtr(const std::string &s) const { - auto &ir_builder = GetBuilder(); - return ir_builder.CreateConstInBoundsGEP2_32(nullptr, ConstString(s), 0, 0); -} - llvm::Value *CodeGen::AllocateVariable(llvm::Type *type, const std::string &name) { // To allocate a variable, a function must be under construction @@ -143,12 +156,9 @@ llvm::Value *CodeGen::CallPrintf(const std::string &format, "printf", llvm::TypeBuilder::get(GetContext()), reinterpret_cast(printf)); } - auto &ir_builder = code_context_.GetBuilder(); - auto *format_str = - ir_builder.CreateGEP(ConstString(format), {Const32(0), Const32(0)}); // Collect all the arguments into a vector - std::vector printf_args{format_str}; + std::vector printf_args = {ConstString(format, "format")}; printf_args.insert(printf_args.end(), args.begin(), args.end()); // Call the function diff --git a/src/codegen/type/boolean_type.cpp b/src/codegen/type/boolean_type.cpp index 2580e210d4b..edc761d8179 100644 --- a/src/codegen/type/boolean_type.cpp +++ b/src/codegen/type/boolean_type.cpp @@ -84,7 +84,8 @@ struct CastBooleanToVarchar : public TypeSystem::CastHandleNull { // Convert this boolean (unsigned int) into a string llvm::Value *str_val = codegen->CreateSelect( - value.GetValue(), codegen.ConstString("T"), codegen.ConstString("F")); + value.GetValue(), codegen.ConstString("T", "true"), + codegen.ConstString("F", "false")); // We could be casting this non-nullable value to a nullable type llvm::Value *null = to_type.nullable ? codegen.ConstBool(false) : nullptr; @@ -250,7 +251,6 @@ struct LogicalOr : public TypeSystem::BinaryOperatorHandleNull { std::vector kImplicitCastingTable = { peloton::type::TypeId::BOOLEAN}; - // Explicit casts CastBooleanToInteger kBooleanToInteger; CastBooleanToDecimal kBooleanToDecimal; diff --git a/src/include/codegen/codegen.h b/src/include/codegen/codegen.h index 5612868d0d5..09edae81900 100644 --- a/src/include/codegen/codegen.h +++ b/src/include/codegen/codegen.h @@ -95,11 +95,13 @@ class CodeGen { llvm::Constant *Const32(int32_t val) const; llvm::Constant *Const64(int64_t val) const; llvm::Constant *ConstDouble(double val) const; - llvm::Constant *ConstString(const std::string &s) const; + llvm::Value *ConstString(const std::string &str_val, + const std::string &name) const; + llvm::Value *ConstGenericBytes(llvm::Type *type, const void *data, + uint32_t length, + const std::string &name) const; llvm::Constant *Null(llvm::Type *type) const; llvm::Constant *NullPtr(llvm::PointerType *type) const; - /// Wrapper for pointer for constant string - llvm::Value *ConstStringPtr(const std::string &s) const; llvm::Value *AllocateVariable(llvm::Type *type, const std::string &name); llvm::Value *AllocateBuffer(llvm::Type *element_type, uint32_t num_elems, From 02bd504f20d6dba942f78ec0d3ef88dc888f2295 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Mon, 30 Apr 2018 00:39:54 -0400 Subject: [PATCH 07/42] When no columns specified during copy, all columns are inserted --- src/binder/bind_node_visitor.cpp | 4 +- src/include/optimizer/input_column_deriver.h | 2 + src/include/optimizer/optimizer.h | 20 +--- src/include/optimizer/util.h | 1 - src/include/parser/copy_statement.h | 8 ++ src/include/planner/abstract_scan_plan.h | 2 +- src/include/planner/csv_scan_plan.h | 107 +++++++++++++++++-- src/optimizer/input_column_deriver.cpp | 2 + src/optimizer/optimizer.cpp | 19 +++- src/optimizer/plan_generator.cpp | 13 ++- 10 files changed, 148 insertions(+), 30 deletions(-) diff --git a/src/binder/bind_node_visitor.cpp b/src/binder/bind_node_visitor.cpp index c7d25093beb..eec8a03c091 100644 --- a/src/binder/bind_node_visitor.cpp +++ b/src/binder/bind_node_visitor.cpp @@ -168,10 +168,12 @@ void BindNodeVisitor::Visit(parser::DeleteStatement *node) { void BindNodeVisitor::Visit(parser::LimitDescription *) {} void BindNodeVisitor::Visit(parser::CopyStatement *node) { - // Bind the source/target table of the copy context_ = std::make_shared(nullptr); if (node->table != nullptr) { node->table->Accept(this); + + // If the table is given, we're either writing or reading all columns + context_->GenerateAllColumnExpressions(node->select_list); } else { node->select_stmt->Accept(this); } diff --git a/src/include/optimizer/input_column_deriver.h b/src/include/optimizer/input_column_deriver.h index fa1ec6ca5a1..728a08305c4 100644 --- a/src/include/optimizer/input_column_deriver.h +++ b/src/include/optimizer/input_column_deriver.h @@ -53,6 +53,8 @@ class InputColumnDeriver : public OperatorVisitor { void Visit(const PhysicalIndexScan *op) override; + void Visit(const ExternalFileScan *op) override; + void Visit(const QueryDerivedScan *op) override; void Visit(const PhysicalOrderBy *) override; diff --git a/src/include/optimizer/optimizer.h b/src/include/optimizer/optimizer.h index 71d7afca265..18608c06756 100644 --- a/src/include/optimizer/optimizer.h +++ b/src/include/optimizer/optimizer.h @@ -23,24 +23,24 @@ namespace peloton { namespace parser { class SQLStatementList; class SQLStatement; -} +} // namespace parser namespace planner { class AbstractPlan; -}; +} // namespace planner namespace optimizer { class OperatorExpression; -} +} // namespace optimizer namespace concurrency { class TransactionContext; -} +} // namespace concurrency namespace test { class OptimizerRuleTests_SimpleAssociativeRuleTest_Test; class OptimizerRuleTests_SimpleAssociativeRuleTest2_Test; -} +} // namespace test namespace optimizer { @@ -110,16 +110,6 @@ class Optimizer : public AbstractOptimizer { parser::SQLStatement *tree, bool &is_ddl_stmt, concurrency::TransactionContext *txn); - /** - * Construct a plan object for the given parsed copy statement. - * - * @param copy_stmt The copy statement we're transforming - * @param txn The transactional context - * @return The construct plan object for the COPY statement - */ - std::unique_ptr HandleDDLCopyStatement( - parser::CopyStatement *copy_stmt, concurrency::TransactionContext *txn); - /* TransformQueryTree - create an initial operator tree for the given query * to be used in performing optimization. * diff --git a/src/include/optimizer/util.h b/src/include/optimizer/util.h index 877bdee3b96..634e1297347 100644 --- a/src/include/optimizer/util.h +++ b/src/include/optimizer/util.h @@ -17,7 +17,6 @@ #include #include "expression/abstract_expression.h" -#include "parser/copy_statement.h" #include "planner/abstract_plan.h" namespace peloton { diff --git a/src/include/parser/copy_statement.h b/src/include/parser/copy_statement.h index 92100e312d1..8145cd695e9 100644 --- a/src/include/parser/copy_statement.h +++ b/src/include/parser/copy_statement.h @@ -46,14 +46,22 @@ class CopyStatement : public SQLStatement { /// ////////////////////////////////////////////////////////////////////////////// + // The table that is copied into or copied from std::unique_ptr table; + // The SQL statement used instead of a table when copying data out to a file std::unique_ptr select_stmt; + // The set of attributes being written out or read in + std::vector> select_list; + + // The type of copy CopyType type; + // The input or output file that is read of written into std::string file_path; + // The format of the file ExternalFileFormat format; bool is_from; diff --git a/src/include/planner/abstract_scan_plan.h b/src/include/planner/abstract_scan_plan.h index 099bf5a161b..b770d66b7fe 100644 --- a/src/include/planner/abstract_scan_plan.h +++ b/src/include/planner/abstract_scan_plan.h @@ -56,7 +56,7 @@ class AbstractScan : public AbstractPlan { storage::DataTable *GetTable() const { return target_table_; } - void GetAttributes(std::vector &ais) const { + virtual void GetAttributes(std::vector &ais) const { for (const auto &ai : attributes_) { ais.push_back(&ai); } diff --git a/src/include/planner/csv_scan_plan.h b/src/include/planner/csv_scan_plan.h index e871b15ac02..1c14a1d9ece 100644 --- a/src/include/planner/csv_scan_plan.h +++ b/src/include/planner/csv_scan_plan.h @@ -12,23 +12,63 @@ #pragma once -#include "planner/abstract_plan.h" +#include + +#include "codegen/type/type.h" +#include "planner/abstract_scan_plan.h" +#include "planner/attribute_info.h" namespace peloton { namespace planner { -class CSVScanPlan : public AbstractPlan { +class CSVScanPlan : public AbstractScan { public: - CSVScanPlan(std::string file_name) : file_name_(std::move(file_name)) {} + struct ColumnInfo { + std::string name; + type::TypeId type; + }; - PlanNodeType GetPlanNodeType() const override { - return PlanNodeType::CSVSCAN; - } + public: + /** + * Constructs a sequential scan over a CSV file + * + * @param file_name The file path + * @param cols Information of the columns expected in each row of the CSV + */ + CSVScanPlan(std::string file_name, std::vector &&cols); + + ////////////////////////////////////////////////////////////////////////////// + /// + /// Accessors + /// + ////////////////////////////////////////////////////////////////////////////// + + PlanNodeType GetPlanNodeType() const override; + + void GetOutputColumns(std::vector &columns) const override; + + const std::string &GetFileName() const { return file_name_; } + + void GetAttributes(std::vector &ais) const override; + + ////////////////////////////////////////////////////////////////////////////// + /// + /// Utilities + Internal + /// + ////////////////////////////////////////////////////////////////////////////// + + hash_t Hash() const override; + + bool operator==(const AbstractPlan &rhs) const override; std::unique_ptr Copy() const override; + void PerformBinding(BindingContext &binding_context) override; + private: const std::string file_name_; + + std::vector> attributes_; }; //////////////////////////////////////////////////////////////////////////////// @@ -37,9 +77,60 @@ class CSVScanPlan : public AbstractPlan { /// //////////////////////////////////////////////////////////////////////////////// +inline CSVScanPlan::CSVScanPlan(std::string file_name, + std::vector &&cols) + : file_name_(std::move(file_name)) { + for (const auto &col : cols) { + std::unique_ptr attribute{ + new planner::AttributeInfo()}; + attribute->name = col.name; + attribute->type = codegen::type::Type{col.type, true}; + attributes_.emplace_back(std::move(attribute)); + } +} + +inline PlanNodeType CSVScanPlan::GetPlanNodeType() const { + return PlanNodeType::CSVSCAN; +} + inline std::unique_ptr CSVScanPlan::Copy() const { - // TODO - return std::unique_ptr(); + std::vector new_cols; + for (const auto &attribute : attributes_) { + new_cols.push_back(CSVScanPlan::ColumnInfo{ + .name = attribute->name, .type = attribute->type.type_id}); + } + return std::unique_ptr( + new CSVScanPlan(file_name_, std::move(new_cols))); +} + +inline void CSVScanPlan::PerformBinding(BindingContext &binding_context) { + for (uint32_t i = 0; i < attributes_.size(); i++) { + binding_context.BindNew(i, attributes_[i].get()); + } +} + +inline void CSVScanPlan::GetOutputColumns(std::vector &columns) const { + columns.clear(); + columns.resize(attributes_.size()); + std::iota(columns.begin(), columns.end(), 0); +} + +inline hash_t CSVScanPlan::Hash() const { + return HashUtil::HashBytes(file_name_.data(), file_name_.length()); +} + +inline bool CSVScanPlan::operator==(const AbstractPlan &rhs) const { + if (rhs.GetPlanNodeType() != PlanNodeType::CSVSCAN) return false; + const auto &other = static_cast(rhs); + return StringUtil::Upper(file_name_) == StringUtil::Upper(other.file_name_); +} + +inline void CSVScanPlan::GetAttributes( + std::vector &ais) const { + ais.clear(); + for (const auto &ai : attributes_) { + ais.push_back(ai.get()); + } } } // namespace planner diff --git a/src/optimizer/input_column_deriver.cpp b/src/optimizer/input_column_deriver.cpp index 7819f81afb9..08d7c54a4ae 100644 --- a/src/optimizer/input_column_deriver.cpp +++ b/src/optimizer/input_column_deriver.cpp @@ -55,6 +55,8 @@ void InputColumnDeriver::Visit(const PhysicalSeqScan *) { ScanHelper(); } void InputColumnDeriver::Visit(const PhysicalIndexScan *) { ScanHelper(); } +void InputColumnDeriver::Visit(const ExternalFileScan *) { ScanHelper(); } + void InputColumnDeriver::Visit(const QueryDerivedScan *op) { // QueryDerivedScan should only be a renaming layer ExprMap output_cols_map; diff --git a/src/optimizer/optimizer.cpp b/src/optimizer/optimizer.cpp index 5722034ef7d..2525915fcc1 100644 --- a/src/optimizer/optimizer.cpp +++ b/src/optimizer/optimizer.cpp @@ -259,20 +259,33 @@ QueryInfo Optimizer::GetQueryInfo(parser::SQLStatement *tree) { std::shared_ptr physical_props = std::make_shared(); switch (tree->GetType()) { case StatementType::SELECT: { - auto select = reinterpret_cast(tree); + auto *select = reinterpret_cast(tree); GetQueryInfoHelper(select->select_list, select->order, output_exprs, physical_props); break; } case StatementType::INSERT: { - auto insert = reinterpret_cast(tree); + auto *insert = reinterpret_cast(tree); if (insert->select != nullptr) GetQueryInfoHelper(insert->select->select_list, insert->select->order, output_exprs, physical_props); break; } + case StatementType::COPY: { + auto *copy = reinterpret_cast(tree); + if (copy->select_stmt != nullptr) { + GetQueryInfoHelper(copy->select_stmt->select_list, + copy->select_stmt->order, output_exprs, + physical_props); + } else { + std::unique_ptr order; + GetQueryInfoHelper(copy->select_list, order, output_exprs, + physical_props); + } + break; + } default: - ; + break; } return QueryInfo(output_exprs, physical_props); diff --git a/src/optimizer/plan_generator.cpp b/src/optimizer/plan_generator.cpp index 6960e4ddd25..804184b6246 100644 --- a/src/optimizer/plan_generator.cpp +++ b/src/optimizer/plan_generator.cpp @@ -15,6 +15,7 @@ #include "catalog/column_catalog.h" #include "catalog/index_catalog.h" #include "catalog/table_catalog.h" +#include "codegen/type/type.h" #include "concurrency/transaction_context.h" #include "expression/expression_util.h" #include "optimizer/operator_expression.h" @@ -131,7 +132,17 @@ void PlanGenerator::Visit(const PhysicalIndexScan *op) { void PlanGenerator::Visit(const ExternalFileScan *op) { switch (op->format) { case ExternalFileFormat::CSV: { - output_plan_.reset(new planner::CSVScanPlan(op->file_name)); + // First construct the output column descriptions + std::vector cols; + for (const auto *output_col : output_cols_) { + auto col_info = planner::CSVScanPlan::ColumnInfo{ + .name = "", .type = output_col->GetValueType()}; + cols.emplace_back(std::move(col_info)); + } + + // Create the plan + output_plan_.reset( + new planner::CSVScanPlan(op->file_name, std::move(cols))); break; } } From 226d341d62d8cbbfd0015db4d5b4fcba0d4d6b20 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Mon, 30 Apr 2018 00:41:06 -0400 Subject: [PATCH 08/42] Added function to throw expception with ill-formatted input string when converting to number --- src/codegen/runtime_functions.cpp | 4 ++++ src/include/codegen/runtime_functions.h | 10 +++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/codegen/runtime_functions.cpp b/src/codegen/runtime_functions.cpp index 23dc1eec6c6..2bebfc150a6 100644 --- a/src/codegen/runtime_functions.cpp +++ b/src/codegen/runtime_functions.cpp @@ -255,5 +255,9 @@ void RuntimeFunctions::ThrowOverflowException() { throw std::overflow_error("ERROR: overflow"); } +void RuntimeFunctions::ThrowInvalidInputStringException() { + throw std::runtime_error("ERROR: invalid input string"); +} + } // namespace codegen } // namespace peloton \ No newline at end of file diff --git a/src/include/codegen/runtime_functions.h b/src/include/codegen/runtime_functions.h index 13712188be4..4438bce31b7 100644 --- a/src/include/codegen/runtime_functions.h +++ b/src/include/codegen/runtime_functions.h @@ -77,7 +77,7 @@ class RuntimeFunctions { */ static void GetTileGroupLayout(const storage::TileGroup *tile_group, ColumnLayoutInfo *infos, uint32_t num_cols); - + /** * Execute a parallel scan over the given table in the given database. * @@ -106,6 +106,12 @@ class RuntimeFunctions { void *query_state, executor::ExecutorContext::ThreadStates &thread_states, void (*work_func)(void *, void *)); + ////////////////////////////////////////////////////////////////////////////// + /// + /// Exception related functions + /// + ////////////////////////////////////////////////////////////////////////////// + /** * Throw a divide-by-zero exception. This function doesn't return. */ @@ -115,6 +121,8 @@ class RuntimeFunctions { * Throw a mathematical overflow exception. This function does not return. */ static void ThrowOverflowException(); + + static void ThrowInvalidInputStringException(); }; } // namespace codegen From e9e1a8f33867a43b18457e13e01dfc4edfab755a Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Mon, 30 Apr 2018 00:41:28 -0400 Subject: [PATCH 09/42] Removed serialization --- src/network/service/peloton_service.cpp | 4 ++-- test/network/rpc_queryplan_test.cpp | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/network/service/peloton_service.cpp b/src/network/service/peloton_service.cpp index 9e5095a0916..90a5b81ee8f 100644 --- a/src/network/service/peloton_service.cpp +++ b/src/network/service/peloton_service.cpp @@ -357,7 +357,7 @@ void PelotonService::QueryPlan(::google::protobuf::RpcController *controller, LOG_ERROR("Queryplan recived desen't have type"); break; } - +#if 0 case PlanNodeType::SEQSCAN: { LOG_TRACE("SEQSCAN revieved"); std::string plan = request->plan(); @@ -400,7 +400,7 @@ void PelotonService::QueryPlan(::google::protobuf::RpcController *controller, break; } - +#endif default: { LOG_ERROR("Queryplan recived :: Unsupported TYPE: %s", PlanNodeTypeToString(plan_type).c_str()); diff --git a/test/network/rpc_queryplan_test.cpp b/test/network/rpc_queryplan_test.cpp index cb11891a1db..90b55e06668 100644 --- a/test/network/rpc_queryplan_test.cpp +++ b/test/network/rpc_queryplan_test.cpp @@ -22,6 +22,7 @@ namespace test { class RpcQueryPlanTests : public PelotonTest {}; TEST_F(RpcQueryPlanTests, BasicTest) { +#if 0 peloton::planner::SeqScanPlan mapped_plan_ptr; const peloton::PlanNodeType type = mapped_plan_ptr.GetPlanNodeType(); @@ -32,6 +33,7 @@ TEST_F(RpcQueryPlanTests, BasicTest) { bool serialize = mapped_plan_ptr.SerializeTo(output_plan); // Becuase the plan is not completed, so it is false EXPECT_FALSE(serialize); +#endif } } } From 0847d23abb383b1dd5426b3c08fb999e45e36f3a Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Mon, 30 Apr 2018 00:43:29 -0400 Subject: [PATCH 10/42] Added input functions in prepartion to read table data from files --- src/codegen/proxy/runtime_functions_proxy.cpp | 2 + src/codegen/proxy/values_runtime_proxy.cpp | 7 + src/codegen/values_runtime.cpp | 223 ++++++++++++++++-- .../codegen/proxy/runtime_functions_proxy.h | 6 + .../codegen/proxy/values_runtime_proxy.h | 7 + src/include/codegen/values_runtime.h | 35 +++ 6 files changed, 254 insertions(+), 26 deletions(-) diff --git a/src/codegen/proxy/runtime_functions_proxy.cpp b/src/codegen/proxy/runtime_functions_proxy.cpp index b406a50fcca..652d1ba2e08 100644 --- a/src/codegen/proxy/runtime_functions_proxy.cpp +++ b/src/codegen/proxy/runtime_functions_proxy.cpp @@ -26,6 +26,8 @@ DEFINE_TYPE(ColumnLayoutInfo, "peloton::ColumnLayoutInfo", col_start_ptr, DEFINE_TYPE(AbstractExpression, "peloton::expression::AbstractExpression", opaque); +DEFINE_TYPE(Type, "peloton::Type", opaque); + DEFINE_METHOD(peloton::codegen, RuntimeFunctions, HashMurmur3); DEFINE_METHOD(peloton::codegen, RuntimeFunctions, HashCrc64); DEFINE_METHOD(peloton::codegen, RuntimeFunctions, GetTileGroup); diff --git a/src/codegen/proxy/values_runtime_proxy.cpp b/src/codegen/proxy/values_runtime_proxy.cpp index 85f866e74f8..e8dd45d10bc 100644 --- a/src/codegen/proxy/values_runtime_proxy.cpp +++ b/src/codegen/proxy/values_runtime_proxy.cpp @@ -27,6 +27,13 @@ DEFINE_METHOD(peloton::codegen, ValuesRuntime, OutputTimestamp); DEFINE_METHOD(peloton::codegen, ValuesRuntime, OutputDecimal); DEFINE_METHOD(peloton::codegen, ValuesRuntime, OutputVarchar); DEFINE_METHOD(peloton::codegen, ValuesRuntime, OutputVarbinary); + +DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputBoolean); +DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputTinyInt); +DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputSmallInt); +DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputInteger); +DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputBigInt); + DEFINE_METHOD(peloton::codegen, ValuesRuntime, CompareStrings); } // namespace codegen diff --git a/src/codegen/values_runtime.cpp b/src/codegen/values_runtime.cpp index 461d6f8faf7..33977174925 100644 --- a/src/codegen/values_runtime.cpp +++ b/src/codegen/values_runtime.cpp @@ -6,12 +6,16 @@ // // Identification: src/codegen/values_runtime.cpp // -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// #include "codegen/values_runtime.h" +#include + +#include "codegen/runtime_functions.h" +#include "codegen/type/type.h" #include "type/value.h" #include "type/type_util.h" #include "type/value_factory.h" @@ -22,75 +26,242 @@ namespace codegen { namespace { -inline void SetValue(type::Value *val_ptr, type::Value &&val) { - new (val_ptr) type::Value(val); +inline void SetValue(peloton::type::Value *val_ptr, + peloton::type::Value &&val) { + new (val_ptr) peloton::type::Value(val); } } // namespace void ValuesRuntime::OutputBoolean(char *values, uint32_t idx, bool val, bool is_null) { - auto *vals = reinterpret_cast(values); + auto *vals = reinterpret_cast(values); if (is_null) { - SetValue(&vals[idx], - type::ValueFactory::GetNullValueByType(type::TypeId::BOOLEAN)); + SetValue(&vals[idx], peloton::type::ValueFactory::GetNullValueByType( + peloton::type::TypeId::BOOLEAN)); } else { - SetValue(&vals[idx], type::ValueFactory::GetBooleanValue(val)); + SetValue(&vals[idx], peloton::type::ValueFactory::GetBooleanValue(val)); } } void ValuesRuntime::OutputTinyInt(char *values, uint32_t idx, int8_t val) { - auto *vals = reinterpret_cast(values); - SetValue(&vals[idx], type::ValueFactory::GetTinyIntValue(val)); + auto *vals = reinterpret_cast(values); + SetValue(&vals[idx], peloton::type::ValueFactory::GetTinyIntValue(val)); } void ValuesRuntime::OutputSmallInt(char *values, uint32_t idx, int16_t val) { - auto *vals = reinterpret_cast(values); - SetValue(&vals[idx], type::ValueFactory::GetSmallIntValue(val)); + auto *vals = reinterpret_cast(values); + SetValue(&vals[idx], peloton::type::ValueFactory::GetSmallIntValue(val)); } void ValuesRuntime::OutputInteger(char *values, uint32_t idx, int32_t val) { - auto *vals = reinterpret_cast(values); - SetValue(&vals[idx], type::ValueFactory::GetIntegerValue(val)); + auto *vals = reinterpret_cast(values); + SetValue(&vals[idx], peloton::type::ValueFactory::GetIntegerValue(val)); } void ValuesRuntime::OutputBigInt(char *values, uint32_t idx, int64_t val) { - auto *vals = reinterpret_cast(values); - SetValue(&vals[idx], type::ValueFactory::GetBigIntValue(val)); + auto *vals = reinterpret_cast(values); + SetValue(&vals[idx], peloton::type::ValueFactory::GetBigIntValue(val)); } void ValuesRuntime::OutputDate(char *values, uint32_t idx, int32_t val) { - auto *vals = reinterpret_cast(values); - SetValue(&vals[idx], type::ValueFactory::GetDateValue(val)); + auto *vals = reinterpret_cast(values); + SetValue(&vals[idx], peloton::type::ValueFactory::GetDateValue(val)); } void ValuesRuntime::OutputTimestamp(char *values, uint32_t idx, int64_t val) { - auto *vals = reinterpret_cast(values); - SetValue(&vals[idx], type::ValueFactory::GetTimestampValue(val)); + auto *vals = reinterpret_cast(values); + SetValue(&vals[idx], peloton::type::ValueFactory::GetTimestampValue(val)); } void ValuesRuntime::OutputDecimal(char *values, uint32_t idx, double val) { - auto *vals = reinterpret_cast(values); - SetValue(&vals[idx], type::ValueFactory::GetDecimalValue(val)); + auto *vals = reinterpret_cast(values); + SetValue(&vals[idx], peloton::type::ValueFactory::GetDecimalValue(val)); } void ValuesRuntime::OutputVarchar(char *values, uint32_t idx, const char *str, uint32_t len) { - auto *vals = reinterpret_cast(values); - SetValue(&vals[idx], type::ValueFactory::GetVarcharValue(str, len, false)); + auto *vals = reinterpret_cast(values); + SetValue(&vals[idx], + peloton::type::ValueFactory::GetVarcharValue(str, len, false)); } void ValuesRuntime::OutputVarbinary(char *values, uint32_t idx, const char *ptr, uint32_t len) { - auto *vals = reinterpret_cast(values); + auto *vals = reinterpret_cast(values); const auto *bin_ptr = reinterpret_cast(ptr); SetValue(&vals[idx], - type::ValueFactory::GetVarbinaryValue(bin_ptr, len, false)); + peloton::type::ValueFactory::GetVarbinaryValue(bin_ptr, len, false)); +} + +namespace { + +void TrimLeftRight(char *&left, char *&right) { + while (*left == ' ') { + left++; + } + while (*right == ' ') { + right++; + } +} + +template +typename std::enable_if::value, T>::type ToNum( + char *ptr, uint32_t len) { + char *start = ptr, *end = ptr + len; + if (start == end) { + // ERROR + } + + // Trim whitespace on left and right + TrimLeftRight(start, end); + + // Check negative or positive sign + bool negative = false; + if (*start == '-') { + negative = true; + start++; + } else if (*start == '+') { + start++; + } + + int64_t num = 0; + while (start != end) { + if (*start < '0' || *start > '9') { + RuntimeFunctions::ThrowInvalidInputStringException(); + } + + num = (num * 10) + (*start - '0'); + + start++; + } + + if (negative) { + num = -num; + } + + if (num <= std::numeric_limits::min() || + num >= std::numeric_limits::max()) { + RuntimeFunctions::ThrowOverflowException(); + } + + return static_cast(num); +} + +} // namespace + +bool ValuesRuntime::InputBoolean(UNUSED_ATTRIBUTE const type::Type &type, + char *ptr, uint32_t len) { + PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); + PELOTON_ASSERT(len != 0 && "Length must be non-zero"); + + char *start = ptr, *end = ptr + len; + + // Trim whitespace on both ends + TrimLeftRight(start, end); + + // + uint64_t trimmed_len = end - start; + + // Check cases + switch (*start) { + case 't': + case 'T': { + static constexpr char kTrue[] = "true"; + if (strncasecmp(start, kTrue, std::min(trimmed_len, sizeof(kTrue)))) { + return true; + } + break; + } + case 'f': + case 'F': { + static constexpr char kFalse[] = "false"; + if (strncasecmp(start, kFalse, std::min(trimmed_len, sizeof(kFalse)))) { + return false; + } + break; + } + case 'y': + case 'Y': { + static constexpr char kYes[] = "yes"; + if (strncasecmp(start, kYes, std::min(trimmed_len, sizeof(kYes)))) { + return true; + } + break; + } + case 'n': + case 'N': { + static constexpr char kNo[] = "no"; + if (strncasecmp(start, kNo, std::min(trimmed_len, sizeof(kNo)))) { + return false; + } + break; + } + case 'o': + case 'O': { + // 'o' not enough to distinguish between on/off + static constexpr char kOff[] = "off"; + static constexpr char kOn[] = "on"; + if (strncasecmp(start, kOff, std::min(trimmed_len, sizeof(kOff)))) { + return false; + } else if (strncasecmp(start, kOn, std::min(trimmed_len, sizeof(kOn)))) { + return true; + } + break; + } + case '0': { + if (trimmed_len == 1) { + return false; + } else { + return true; + } + } + case '1': { + if (trimmed_len == 1) { + return true; + } else { + return false; + } + } + default: { break; } + } + + // Error + RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); +} + +int8_t ValuesRuntime::InputTinyInt(UNUSED_ATTRIBUTE const type::Type &type, + char *ptr, uint32_t len) { + PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); + PELOTON_ASSERT(len != 0 && "Length must be non-zero"); + return ToNum(ptr, len); +} + +int16_t ValuesRuntime::InputSmallInt(UNUSED_ATTRIBUTE const type::Type &type, + char *ptr, uint32_t len) { + PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); + PELOTON_ASSERT(len != 0 && "Length must be non-zero"); + return ToNum(ptr, len); +} + +int32_t ValuesRuntime::InputInteger(UNUSED_ATTRIBUTE const type::Type &type, + char *ptr, uint32_t len) { + PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); + PELOTON_ASSERT(len != 0 && "Length must be non-zero"); + return ToNum(ptr, len); +} + +int64_t ValuesRuntime::InputBigInt(UNUSED_ATTRIBUTE const type::Type &type, + char *ptr, uint32_t len) { + PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); + PELOTON_ASSERT(len != 0 && "Length must be non-zero"); + return ToNum(ptr, len); } int32_t ValuesRuntime::CompareStrings(const char *str1, uint32_t len1, const char *str2, uint32_t len2) { - return type::TypeUtil::CompareStrings(str1, len1, str2, len2); + return peloton::type::TypeUtil::CompareStrings(str1, len1, str2, len2); } } // namespace codegen diff --git a/src/include/codegen/proxy/runtime_functions_proxy.h b/src/include/codegen/proxy/runtime_functions_proxy.h index c20ba145eb4..5700f7fffb9 100644 --- a/src/include/codegen/proxy/runtime_functions_proxy.h +++ b/src/include/codegen/proxy/runtime_functions_proxy.h @@ -33,6 +33,11 @@ PROXY(AbstractExpression) { DECLARE_TYPE; }; +PROXY(Type) { + DECLARE_MEMBER(0, char[sizeof(codegen::type::Type)], opaque); + DECLARE_TYPE; +}; + PROXY(RuntimeFunctions) { DECLARE_METHOD(HashMurmur3); DECLARE_METHOD(HashCrc64); @@ -47,6 +52,7 @@ PROXY(RuntimeFunctions) { TYPE_BUILDER(ColumnLayoutInfo, codegen::RuntimeFunctions::ColumnLayoutInfo); TYPE_BUILDER(AbstractExpression, expression::AbstractExpression); +TYPE_BUILDER(Type, codegen::type::Type); } // namespace codegen } // namespace peloton \ No newline at end of file diff --git a/src/include/codegen/proxy/values_runtime_proxy.h b/src/include/codegen/proxy/values_runtime_proxy.h index e74954a999a..77f78979572 100644 --- a/src/include/codegen/proxy/values_runtime_proxy.h +++ b/src/include/codegen/proxy/values_runtime_proxy.h @@ -29,6 +29,13 @@ PROXY(ValuesRuntime) { DECLARE_METHOD(OutputDecimal); DECLARE_METHOD(OutputVarchar); DECLARE_METHOD(OutputVarbinary); + + DECLARE_MEMBER(InputBoolean); + DECLARE_MEMBER(InputTinyInt); + DECLARE_MEMBER(InputSmallInt); + DECLARE_MEMBER(InputInteger); + DECLARE_MEMBER(InputBigInt); + DECLARE_METHOD(CompareStrings); }; diff --git a/src/include/codegen/values_runtime.h b/src/include/codegen/values_runtime.h index e6cf4967ca2..e37396e5aa8 100644 --- a/src/include/codegen/values_runtime.h +++ b/src/include/codegen/values_runtime.h @@ -17,8 +17,18 @@ namespace peloton { namespace codegen { +namespace type { +class Type; +} // namespace type + class ValuesRuntime { public: + ////////////////////////////////////////////////////////////////////////////// + /// + /// Output functions + /// + ////////////////////////////////////////////////////////////////////////////// + // Write out the given boolean value into the array at the provided index static void OutputBoolean(char *values, uint32_t idx, bool val, bool is_null); @@ -51,6 +61,31 @@ class ValuesRuntime { static void OutputVarbinary(char *values, uint32_t idx, const char *str, uint32_t len); + ////////////////////////////////////////////////////////////////////////////// + /// + /// Input functions + //// + ////////////////////////////////////////////////////////////////////////////// + + static bool InputBoolean(const type::Type &type, char *ptr, uint32_t len); + + static int8_t InputTinyInt(const type::Type &type, char *ptr, uint32_t len); + + static int16_t InputSmallInt(const type::Type &type, char *ptr, uint32_t len); + + static int32_t InputInteger(const type::Type &type, char *ptr, uint32_t len); + + static int64_t InputBigInt(const type::Type &type, char *ptr, uint32_t len); + + /** + * Compare two strings, returning an integer value indicating their sort order + * + * @param str1 A pointer to the first string + * @param len1 The length of the first string + * @param str2 A pointer to the second string + * @param len2 The length of the second string + * @return + */ static int32_t CompareStrings(const char *str1, uint32_t len1, const char *str2, uint32_t len2); }; From 1d9e33430693016078f3080dd3e5c3469eeb2859 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Mon, 30 Apr 2018 09:26:01 -0400 Subject: [PATCH 11/42] All SQL types must now provide an input function to convert a string into a SQL type --- src/codegen/proxy/values_runtime_proxy.cpp | 1 + src/codegen/type/array_type.cpp | 11 +++-- src/codegen/type/bigint_type.cpp | 37 +++++++-------- src/codegen/type/boolean_type.cpp | 16 ++++--- src/codegen/type/date_type.cpp | 11 ++++- src/codegen/type/decimal_type.cpp | 30 +++++++------ src/codegen/type/integer_type.cpp | 45 +++++++++---------- src/codegen/type/smallint_type.cpp | 39 ++++++++-------- src/codegen/type/sql_type.cpp | 6 +++ src/codegen/type/timestamp_type.cpp | 11 ++++- src/codegen/type/tinyint_type.cpp | 25 ++++++----- src/codegen/type/varbinary_type.cpp | 11 +++-- src/codegen/type/varchar_type.cpp | 22 +++++---- src/codegen/values_runtime.cpp | 39 +++++++++++++++- .../codegen/proxy/values_runtime_proxy.h | 10 ++--- src/include/codegen/type/array_type.h | 3 ++ src/include/codegen/type/bigint_type.h | 3 ++ src/include/codegen/type/boolean_type.h | 3 ++ src/include/codegen/type/date_type.h | 3 ++ src/include/codegen/type/decimal_type.h | 3 ++ src/include/codegen/type/integer_type.h | 3 ++ src/include/codegen/type/smallint_type.h | 3 ++ src/include/codegen/type/sql_type.h | 2 + src/include/codegen/type/timestamp_type.h | 3 ++ src/include/codegen/type/tinyint_type.h | 3 ++ src/include/codegen/type/varbinary_type.h | 3 ++ src/include/codegen/type/varchar_type.h | 3 ++ 27 files changed, 231 insertions(+), 118 deletions(-) diff --git a/src/codegen/proxy/values_runtime_proxy.cpp b/src/codegen/proxy/values_runtime_proxy.cpp index e8dd45d10bc..37f90834362 100644 --- a/src/codegen/proxy/values_runtime_proxy.cpp +++ b/src/codegen/proxy/values_runtime_proxy.cpp @@ -13,6 +13,7 @@ #include "codegen/proxy/values_runtime_proxy.h" #include "codegen/proxy/value_proxy.h" +#include "codegen/proxy/runtime_functions_proxy.h" namespace peloton { namespace codegen { diff --git a/src/codegen/type/array_type.cpp b/src/codegen/type/array_type.cpp index b99daa2a4ac..f9e6e49a677 100644 --- a/src/codegen/type/array_type.cpp +++ b/src/codegen/type/array_type.cpp @@ -61,9 +61,8 @@ static std::vector kNoArgOperatorTable = {}; Array::Array() : SqlType(peloton::type::TypeId::ARRAY), type_system_(kImplicitCastingTable, kExplicitCastingTable, - kComparisonTable, kUnaryOperatorTable, - kBinaryOperatorTable, kNaryOperatorTable, - kNoArgOperatorTable) {} + kComparisonTable, kUnaryOperatorTable, kBinaryOperatorTable, + kNaryOperatorTable, kNoArgOperatorTable) {} Value Array::GetMinValue(UNUSED_ATTRIBUTE CodeGen &codegen) const { throw Exception{"Arrays don't have minimum values ...."}; @@ -86,6 +85,12 @@ void Array::GetTypeForMaterialization( "Arrays currently do not have a materialization format. Fix me."}; } +llvm::Function *Array::GetInputFunction( + UNUSED_ATTRIBUTE CodeGen &codegen, + UNUSED_ATTRIBUTE const Type &type) const { + throw NotImplementedException{"Array's can't be input ... for now ..."}; +} + llvm::Function *Array::GetOutputFunction( UNUSED_ATTRIBUTE CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { diff --git a/src/codegen/type/bigint_type.cpp b/src/codegen/type/bigint_type.cpp index e20e3e0396f..9332bc51fbc 100644 --- a/src/codegen/type/bigint_type.cpp +++ b/src/codegen/type/bigint_type.cpp @@ -190,8 +190,7 @@ struct Abs : public TypeSystem::UnaryOperatorHandleNull { } Value Impl(CodeGen &codegen, const Value &val, - const TypeSystem::InvocationContext &ctx) - const override { + const TypeSystem::InvocationContext &ctx) const override { PELOTON_ASSERT(SupportsType(val.GetType())); // The BigInt subtraction implementation Sub sub; @@ -201,7 +200,8 @@ struct Abs : public TypeSystem::UnaryOperatorHandleNull { // We want: raw_ret = (val < 0 ? 0 - val : val) auto sub_result = sub.Impl(codegen, zero, val, ctx); auto *lt_zero = codegen->CreateICmpSLT(val.GetValue(), zero.GetValue()); - auto *raw_ret = codegen->CreateSelect(lt_zero, sub_result.GetValue(), val.GetValue()); + auto *raw_ret = + codegen->CreateSelect(lt_zero, sub_result.GetValue(), val.GetValue()); return Value{BigInt::Instance(), raw_ret}; } }; @@ -287,7 +287,7 @@ struct Sqrt : public TypeSystem::UnaryOperatorHandleNull { protected: Value Impl(CodeGen &codegen, const Value &val, UNUSED_ATTRIBUTE const TypeSystem::InvocationContext &ctx) - const override { + const override { auto casted = cast.Impl(codegen, val, Decimal::Instance()); auto *raw_ret = codegen.Sqrt(casted.GetValue()); return Value{Decimal::Instance(), raw_ret}; @@ -332,10 +332,9 @@ struct Add : public TypeSystem::BinaryOperatorHandleNull { }; // Subtraction -bool Sub::SupportsTypes(const Type &left_type, - const Type &right_type) const { +bool Sub::SupportsTypes(const Type &left_type, const Type &right_type) const { return left_type.GetSqlType() == BigInt::Instance() && - left_type == right_type; + left_type == right_type; } Type Sub::ResultType(UNUSED_ATTRIBUTE const Type &left_type, @@ -350,7 +349,7 @@ Value Sub::Impl(CodeGen &codegen, const Value &left, const Value &right, // Do subtraction llvm::Value *overflow_bit = nullptr; llvm::Value *result = codegen.CallSubWithOverflow( - left.GetValue(), right.GetValue(), overflow_bit); + left.GetValue(), right.GetValue(), overflow_bit); if (ctx.on_error == OnError::Exception) { codegen.ThrowIfOverflow(overflow_bit); @@ -513,20 +512,17 @@ struct Modulo : public TypeSystem::BinaryOperatorHandleNull { std::vector kImplicitCastingTable = { peloton::type::TypeId::BIGINT, peloton::type::TypeId::DECIMAL}; +// clang-format off // Explicit casts CastBigInt kCastBigInt; std::vector kExplicitCastingTable = { - {peloton::type::TypeId::BIGINT, peloton::type::TypeId::BOOLEAN, - kCastBigInt}, - {peloton::type::TypeId::BIGINT, peloton::type::TypeId::TINYINT, - kCastBigInt}, - {peloton::type::TypeId::BIGINT, peloton::type::TypeId::SMALLINT, - kCastBigInt}, - {peloton::type::TypeId::BIGINT, peloton::type::TypeId::INTEGER, - kCastBigInt}, + {peloton::type::TypeId::BIGINT, peloton::type::TypeId::BOOLEAN, kCastBigInt}, + {peloton::type::TypeId::BIGINT, peloton::type::TypeId::TINYINT, kCastBigInt}, + {peloton::type::TypeId::BIGINT, peloton::type::TypeId::SMALLINT, kCastBigInt}, + {peloton::type::TypeId::BIGINT, peloton::type::TypeId::INTEGER, kCastBigInt}, {peloton::type::TypeId::BIGINT, peloton::type::TypeId::BIGINT, kCastBigInt}, - {peloton::type::TypeId::BIGINT, peloton::type::TypeId::DECIMAL, - kCastBigInt}}; + {peloton::type::TypeId::BIGINT, peloton::type::TypeId::DECIMAL, kCastBigInt}}; +// clang-format on // Comparison operations CompareBigInt kCompareBigInt; @@ -599,6 +595,11 @@ void BigInt::GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, len_type = nullptr; } +llvm::Function *BigInt::GetInputFunction( + CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { + return ValuesRuntimeProxy::InputBigInt.GetFunction(codegen); +} + llvm::Function *BigInt::GetOutputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { return ValuesRuntimeProxy::OutputBigInt.GetFunction(codegen); diff --git a/src/codegen/type/boolean_type.cpp b/src/codegen/type/boolean_type.cpp index edc761d8179..5f7387ed9b4 100644 --- a/src/codegen/type/boolean_type.cpp +++ b/src/codegen/type/boolean_type.cpp @@ -251,17 +251,16 @@ struct LogicalOr : public TypeSystem::BinaryOperatorHandleNull { std::vector kImplicitCastingTable = { peloton::type::TypeId::BOOLEAN}; +// clang-format off // Explicit casts CastBooleanToInteger kBooleanToInteger; CastBooleanToDecimal kBooleanToDecimal; CastBooleanToVarchar kBooleanToVarchar; std::vector kExplicitCastingTable = { - {peloton::type::TypeId::BOOLEAN, peloton::type::TypeId::INTEGER, - kBooleanToInteger}, - {peloton::type::TypeId::BOOLEAN, peloton::type::TypeId::VARCHAR, - kBooleanToVarchar}, - {peloton::type::TypeId::BOOLEAN, peloton::type::TypeId::DECIMAL, - kBooleanToDecimal}}; + {peloton::type::TypeId::BOOLEAN, peloton::type::TypeId::INTEGER, kBooleanToInteger}, + {peloton::type::TypeId::BOOLEAN, peloton::type::TypeId::VARCHAR, kBooleanToVarchar}, + {peloton::type::TypeId::BOOLEAN, peloton::type::TypeId::DECIMAL, kBooleanToDecimal}}; +// clang-format on // Comparison operations CompareBoolean kCompareBoolean; @@ -325,6 +324,11 @@ void Boolean::GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, len_type = nullptr; } +llvm::Function *Boolean::GetInputFunction( + CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { + return ValuesRuntimeProxy::InputBoolean.GetFunction(codegen); +} + llvm::Function *Boolean::GetOutputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { return ValuesRuntimeProxy::OutputBoolean.GetFunction(codegen); diff --git a/src/codegen/type/date_type.cpp b/src/codegen/type/date_type.cpp index 8f11f4d9ff1..26342c23db9 100644 --- a/src/codegen/type/date_type.cpp +++ b/src/codegen/type/date_type.cpp @@ -130,11 +130,12 @@ struct CompareDate : public TypeSystem::SimpleComparisonHandleNull { std::vector kImplicitCastingTable = { peloton::type::TypeId::DATE, peloton::type::TypeId::TIMESTAMP}; +// clang-format off // Explicit casts CastDateToTimestamp kDateToTimestamp; std::vector kExplicitCastingTable = { - {peloton::type::TypeId::DATE, peloton::type::TypeId::TIMESTAMP, - kDateToTimestamp}}; + {peloton::type::TypeId::DATE, peloton::type::TypeId::TIMESTAMP, kDateToTimestamp}}; +// clang-format on // Comparison operations CompareDate kCompareDate; @@ -187,6 +188,12 @@ void Date::GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, len_type = nullptr; } +llvm::Function *Date::GetInputFunction( + UNUSED_ATTRIBUTE CodeGen &codegen, + UNUSED_ATTRIBUTE const Type &type) const { + throw NotImplementedException{"Date inputs not supported yet"}; +} + llvm::Function *Date::GetOutputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { return ValuesRuntimeProxy::OutputDate.GetFunction(codegen); diff --git a/src/codegen/type/decimal_type.cpp b/src/codegen/type/decimal_type.cpp index f081013e0b2..50a0b09e29c 100644 --- a/src/codegen/type/decimal_type.cpp +++ b/src/codegen/type/decimal_type.cpp @@ -192,9 +192,9 @@ struct Abs : public TypeSystem::UnaryOperatorHandleNull { Value Impl(CodeGen &codegen, const Value &val, UNUSED_ATTRIBUTE const TypeSystem::InvocationContext &ctx) - const override { + const override { llvm::Value *raw_ret = - codegen.Call(DecimalFunctionsProxy::Abs, {val.GetValue()}); + codegen.Call(DecimalFunctionsProxy::Abs, {val.GetValue()}); return Value{Decimal::Instance(), raw_ret}; } }; @@ -473,21 +473,17 @@ struct Modulo : public TypeSystem::BinaryOperatorHandleNull { std::vector kImplicitCastingTable = { peloton::type::TypeId::DECIMAL}; +// clang-format off // Explicit casting rules CastDecimal kCastDecimal; std::vector kExplicitCastingTable = { - {peloton::type::TypeId::DECIMAL, peloton::type::TypeId::BOOLEAN, - kCastDecimal}, - {peloton::type::TypeId::DECIMAL, peloton::type::TypeId::TINYINT, - kCastDecimal}, - {peloton::type::TypeId::DECIMAL, peloton::type::TypeId::SMALLINT, - kCastDecimal}, - {peloton::type::TypeId::DECIMAL, peloton::type::TypeId::INTEGER, - kCastDecimal}, - {peloton::type::TypeId::DECIMAL, peloton::type::TypeId::BIGINT, - kCastDecimal}, - {peloton::type::TypeId::DECIMAL, peloton::type::TypeId::DECIMAL, - kCastDecimal}}; + {peloton::type::TypeId::DECIMAL, peloton::type::TypeId::BOOLEAN, kCastDecimal}, + {peloton::type::TypeId::DECIMAL, peloton::type::TypeId::TINYINT, kCastDecimal}, + {peloton::type::TypeId::DECIMAL, peloton::type::TypeId::SMALLINT, kCastDecimal}, + {peloton::type::TypeId::DECIMAL, peloton::type::TypeId::INTEGER, kCastDecimal}, + {peloton::type::TypeId::DECIMAL, peloton::type::TypeId::BIGINT, kCastDecimal}, + {peloton::type::TypeId::DECIMAL, peloton::type::TypeId::DECIMAL, kCastDecimal}}; +// clang-format on // Comparison operations CompareDecimal kCompareDecimal; @@ -562,6 +558,12 @@ void Decimal::GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, len_type = nullptr; } +llvm::Function *Decimal::GetInputFunction( + UNUSED_ATTRIBUTE CodeGen &codegen, + UNUSED_ATTRIBUTE const Type &type) const { + throw NotImplementedException{"Decimal inputs not implemented yet"}; +} + llvm::Function *Decimal::GetOutputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { // TODO: We should be using the precision/scale in the output function diff --git a/src/codegen/type/integer_type.cpp b/src/codegen/type/integer_type.cpp index dc49056a5d1..92809098341 100644 --- a/src/codegen/type/integer_type.cpp +++ b/src/codegen/type/integer_type.cpp @@ -187,8 +187,7 @@ struct Abs : public TypeSystem::UnaryOperatorHandleNull { } Value Impl(CodeGen &codegen, const Value &val, - const TypeSystem::InvocationContext &ctx) - const override { + const TypeSystem::InvocationContext &ctx) const override { // The integer subtraction implementation Sub sub; // Zero place-holder @@ -197,7 +196,8 @@ struct Abs : public TypeSystem::UnaryOperatorHandleNull { // We want: raw_ret = (val < 0 ? 0 - val : val) auto sub_result = sub.Impl(codegen, zero, val, ctx); auto *lt_zero = codegen->CreateICmpSLT(val.GetValue(), zero.GetValue()); - auto *raw_ret = codegen->CreateSelect(lt_zero, sub_result.GetValue(), val.GetValue()); + auto *raw_ret = + codegen->CreateSelect(lt_zero, sub_result.GetValue(), val.GetValue()); return Value{Integer::Instance(), raw_ret}; } }; @@ -251,7 +251,7 @@ struct Floor : public TypeSystem::UnaryOperatorHandleNull { // Ceiling struct Ceil : public TypeSystem::UnaryOperatorHandleNull { CastInteger cast; - + bool SupportsType(const Type &type) const override { return type.GetSqlType() == Integer::Instance(); } @@ -283,7 +283,7 @@ struct Sqrt : public TypeSystem::UnaryOperatorHandleNull { protected: Value Impl(CodeGen &codegen, const Value &val, UNUSED_ATTRIBUTE const TypeSystem::InvocationContext &ctx) - const override { + const override { auto casted = cast.Impl(codegen, val, Decimal::Instance()); auto *raw_ret = codegen.Sqrt(casted.GetValue()); return Value{Decimal::Instance(), raw_ret}; @@ -328,10 +328,9 @@ struct Add : public TypeSystem::BinaryOperatorHandleNull { }; // Subtraction -bool Sub::SupportsTypes(const Type &left_type, - const Type &right_type) const { +bool Sub::SupportsTypes(const Type &left_type, const Type &right_type) const { return left_type.GetSqlType() == Integer::Instance() && - left_type == right_type; + left_type == right_type; } Type Sub::ResultType(UNUSED_ATTRIBUTE const Type &left_type, @@ -346,7 +345,7 @@ Value Sub::Impl(CodeGen &codegen, const Value &left, const Value &right, // Do subtraction llvm::Value *overflow_bit = nullptr; llvm::Value *result = codegen.CallSubWithOverflow( - left.GetValue(), right.GetValue(), overflow_bit); + left.GetValue(), right.GetValue(), overflow_bit); if (ctx.on_error == OnError::Exception) { codegen.ThrowIfOverflow(overflow_bit); @@ -510,26 +509,21 @@ std::vector kImplicitCastingTable = { peloton::type::TypeId::INTEGER, peloton::type::TypeId::BIGINT, peloton::type::TypeId::DECIMAL}; +// clang-format off // Explicit casting rules CastInteger kCastInteger; std::vector kExplicitCastingTable = { - {peloton::type::TypeId::INTEGER, peloton::type::TypeId::BOOLEAN, - kCastInteger}, - {peloton::type::TypeId::INTEGER, peloton::type::TypeId::TINYINT, - kCastInteger}, - {peloton::type::TypeId::INTEGER, peloton::type::TypeId::SMALLINT, - kCastInteger}, - {peloton::type::TypeId::INTEGER, peloton::type::TypeId::INTEGER, - kCastInteger}, - {peloton::type::TypeId::INTEGER, peloton::type::TypeId::BIGINT, - kCastInteger}, - {peloton::type::TypeId::INTEGER, peloton::type::TypeId::DECIMAL, - kCastInteger}}; + {peloton::type::TypeId::INTEGER, peloton::type::TypeId::BOOLEAN, kCastInteger}, + {peloton::type::TypeId::INTEGER, peloton::type::TypeId::TINYINT, kCastInteger}, + {peloton::type::TypeId::INTEGER, peloton::type::TypeId::SMALLINT, kCastInteger}, + {peloton::type::TypeId::INTEGER, peloton::type::TypeId::INTEGER, kCastInteger}, + {peloton::type::TypeId::INTEGER, peloton::type::TypeId::BIGINT, kCastInteger}, + {peloton::type::TypeId::INTEGER, peloton::type::TypeId::DECIMAL, kCastInteger}}; +// clang-format on // Comparison operations CompareInteger kCompareInteger; -std::vector kComparisonTable = { - {kCompareInteger}}; +std::vector kComparisonTable = {{kCompareInteger}}; // Unary operators Negate kNegOp; @@ -599,6 +593,11 @@ void Integer::GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, len_type = nullptr; } +llvm::Function *Integer::GetInputFunction( + CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { + return ValuesRuntimeProxy::InputInteger.GetFunction(codegen); +} + llvm::Function *Integer::GetOutputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { return ValuesRuntimeProxy::OutputInteger.GetFunction(codegen); diff --git a/src/codegen/type/smallint_type.cpp b/src/codegen/type/smallint_type.cpp index 408523ea583..e0f31561c95 100644 --- a/src/codegen/type/smallint_type.cpp +++ b/src/codegen/type/smallint_type.cpp @@ -194,9 +194,7 @@ struct Abs : public TypeSystem::UnaryOperatorHandleNull { } Value Impl(CodeGen &codegen, const Value &val, - const TypeSystem::InvocationContext &ctx) - const override { - + const TypeSystem::InvocationContext &ctx) const override { // The smallint subtraction implementation Sub sub; PELOTON_ASSERT(SupportsType(val.GetType())); @@ -206,7 +204,8 @@ struct Abs : public TypeSystem::UnaryOperatorHandleNull { // We want: raw_ret = (val < 0 ? 0 - val : val) auto sub_result = sub.Impl(codegen, zero, val, ctx); auto *lt_zero = codegen->CreateICmpSLT(val.GetValue(), zero.GetValue()); - auto *raw_ret = codegen->CreateSelect(lt_zero, sub_result.GetValue(), val.GetValue()); + auto *raw_ret = + codegen->CreateSelect(lt_zero, sub_result.GetValue(), val.GetValue()); return Value{SmallInt::Instance(), raw_ret}; } }; @@ -338,10 +337,9 @@ struct Add : public TypeSystem::BinaryOperatorHandleNull { // Subtraction -bool Sub::SupportsTypes(const Type &left_type, - const Type &right_type) const { +bool Sub::SupportsTypes(const Type &left_type, const Type &right_type) const { return left_type.GetSqlType() == SmallInt::Instance() && - left_type == right_type; + left_type == right_type; } Type Sub::ResultType(UNUSED_ATTRIBUTE const Type &left_type, @@ -356,7 +354,7 @@ Value Sub::Impl(CodeGen &codegen, const Value &left, const Value &right, // Do subtraction llvm::Value *overflow_bit = nullptr; llvm::Value *result = codegen.CallSubWithOverflow( - left.GetValue(), right.GetValue(), overflow_bit); + left.GetValue(), right.GetValue(), overflow_bit); if (ctx.on_error == OnError::Exception) { codegen.ThrowIfOverflow(overflow_bit); @@ -522,21 +520,17 @@ std::vector kImplicitCastingTable = { peloton::type::TypeId::SMALLINT, peloton::type::TypeId::INTEGER, peloton::type::TypeId::BIGINT, peloton::type::TypeId::DECIMAL}; +// clang-format off // Explicit casting rules CastSmallInt kCastSmallInt; std::vector kExplicitCastingTable = { - {peloton::type::TypeId::SMALLINT, peloton::type::TypeId::BOOLEAN, - kCastSmallInt}, - {peloton::type::TypeId::SMALLINT, peloton::type::TypeId::TINYINT, - kCastSmallInt}, - {peloton::type::TypeId::SMALLINT, peloton::type::TypeId::SMALLINT, - kCastSmallInt}, - {peloton::type::TypeId::SMALLINT, peloton::type::TypeId::INTEGER, - kCastSmallInt}, - {peloton::type::TypeId::SMALLINT, peloton::type::TypeId::BIGINT, - kCastSmallInt}, - {peloton::type::TypeId::SMALLINT, peloton::type::TypeId::DECIMAL, - kCastSmallInt}}; + {peloton::type::TypeId::SMALLINT, peloton::type::TypeId::BOOLEAN, kCastSmallInt}, + {peloton::type::TypeId::SMALLINT, peloton::type::TypeId::TINYINT, kCastSmallInt}, + {peloton::type::TypeId::SMALLINT, peloton::type::TypeId::SMALLINT, kCastSmallInt}, + {peloton::type::TypeId::SMALLINT, peloton::type::TypeId::INTEGER, kCastSmallInt}, + {peloton::type::TypeId::SMALLINT, peloton::type::TypeId::BIGINT, kCastSmallInt}, + {peloton::type::TypeId::SMALLINT, peloton::type::TypeId::DECIMAL, kCastSmallInt}}; +// clang-format on // Comparison operations CompareSmallInt kCompareSmallInt; @@ -610,6 +604,11 @@ void SmallInt::GetTypeForMaterialization(CodeGen &codegen, len_type = nullptr; } +llvm::Function *SmallInt::GetInputFunction( + CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { + return ValuesRuntimeProxy::InputSmallInt.GetFunction(codegen); +} + llvm::Function *SmallInt::GetOutputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { return ValuesRuntimeProxy::OutputSmallInt.GetFunction(codegen); diff --git a/src/codegen/type/sql_type.cpp b/src/codegen/type/sql_type.cpp index 6901976b008..49613d6d378 100644 --- a/src/codegen/type/sql_type.cpp +++ b/src/codegen/type/sql_type.cpp @@ -54,6 +54,12 @@ class Invalid : public SqlType, public Singleton { throw Exception{"INVALID type doesn't have a materialization type"}; } + llvm::Function *GetInputFunction( + UNUSED_ATTRIBUTE CodeGen &codegen, + UNUSED_ATTRIBUTE const Type &type) const override { + throw Exception{"INVALID type does not have an input function"}; + } + llvm::Function *GetOutputFunction( UNUSED_ATTRIBUTE CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const override { diff --git a/src/codegen/type/timestamp_type.cpp b/src/codegen/type/timestamp_type.cpp index 73603f222b2..68dcd180f0f 100644 --- a/src/codegen/type/timestamp_type.cpp +++ b/src/codegen/type/timestamp_type.cpp @@ -148,11 +148,12 @@ struct Now : public TypeSystem::NoArgOperator { std::vector kImplicitCastingTable = { peloton::type::TypeId::DATE, peloton::type::TypeId::TIMESTAMP}; +// clang-format off // Explicit casts CastTimestampToDate kTimestampToDate; std::vector kExplicitCastingTable = { - {peloton::type::TypeId::TIMESTAMP, peloton::type::TypeId::DATE, - kTimestampToDate}}; + {peloton::type::TypeId::TIMESTAMP, peloton::type::TypeId::DATE, kTimestampToDate}}; +// clang-format on // Comparisons CompareTimestamp kCompareTimestamp; @@ -209,6 +210,12 @@ void Timestamp::GetTypeForMaterialization(CodeGen &codegen, len_type = nullptr; } +llvm::Function *Timestamp::GetInputFunction( + UNUSED_ATTRIBUTE CodeGen &codegen, + UNUSED_ATTRIBUTE const Type &type) const { + throw NotImplementedException{"Timestamp input not implemented yet"}; +} + llvm::Function *Timestamp::GetOutputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { return ValuesRuntimeProxy::OutputTimestamp.GetFunction(codegen); diff --git a/src/codegen/type/tinyint_type.cpp b/src/codegen/type/tinyint_type.cpp index 254ef0d8e47..24cad11558c 100644 --- a/src/codegen/type/tinyint_type.cpp +++ b/src/codegen/type/tinyint_type.cpp @@ -516,21 +516,17 @@ std::vector kImplicitCastingTable = { peloton::type::TypeId::INTEGER, peloton::type::TypeId::BIGINT, peloton::type::TypeId::DECIMAL}; +// clang-format off // Explicit casting rules CastTinyInt kCastTinyInt; std::vector kExplicitCastingTable = { - {peloton::type::TypeId::TINYINT, peloton::type::TypeId::BOOLEAN, - kCastTinyInt}, - {peloton::type::TypeId::TINYINT, peloton::type::TypeId::TINYINT, - kCastTinyInt}, - {peloton::type::TypeId::TINYINT, peloton::type::TypeId::SMALLINT, - kCastTinyInt}, - {peloton::type::TypeId::TINYINT, peloton::type::TypeId::INTEGER, - kCastTinyInt}, - {peloton::type::TypeId::TINYINT, peloton::type::TypeId::BIGINT, - kCastTinyInt}, - {peloton::type::TypeId::TINYINT, peloton::type::TypeId::DECIMAL, - kCastTinyInt}}; + {peloton::type::TypeId::TINYINT, peloton::type::TypeId::BOOLEAN, kCastTinyInt}, + {peloton::type::TypeId::TINYINT, peloton::type::TypeId::TINYINT, kCastTinyInt}, + {peloton::type::TypeId::TINYINT, peloton::type::TypeId::SMALLINT, kCastTinyInt}, + {peloton::type::TypeId::TINYINT, peloton::type::TypeId::INTEGER, kCastTinyInt}, + {peloton::type::TypeId::TINYINT, peloton::type::TypeId::BIGINT, kCastTinyInt}, + {peloton::type::TypeId::TINYINT, peloton::type::TypeId::DECIMAL, kCastTinyInt}}; +// clang-format on // Comparison operations CompareTinyInt kCompareTinyInt; @@ -603,6 +599,11 @@ void TinyInt::GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, len_type = nullptr; } +llvm::Function *TinyInt::GetInputFunction( + CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { + return ValuesRuntimeProxy::InputTinyInt.GetFunction(codegen); +} + llvm::Function *TinyInt::GetOutputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { return ValuesRuntimeProxy::OutputTinyInt.GetFunction(codegen); diff --git a/src/codegen/type/varbinary_type.cpp b/src/codegen/type/varbinary_type.cpp index 7706545c84c..bcbf0c8a1de 100644 --- a/src/codegen/type/varbinary_type.cpp +++ b/src/codegen/type/varbinary_type.cpp @@ -159,9 +159,8 @@ std::vector kNoArgOperatorTable = {}; Varbinary::Varbinary() : SqlType(peloton::type::TypeId::VARBINARY), type_system_(kImplicitCastingTable, kExplicitCastingTable, - kComparisonTable, kUnaryOperatorTable, - kBinaryOperatorTable, kNaryOperatorTable, - kNoArgOperatorTable) {} + kComparisonTable, kUnaryOperatorTable, kBinaryOperatorTable, + kNaryOperatorTable, kNoArgOperatorTable) {} Value Varbinary::GetMinValue(UNUSED_ATTRIBUTE CodeGen &codegen) const { throw Exception{"The VARBINARY type does not have a minimum value ..."}; @@ -183,6 +182,12 @@ void Varbinary::GetTypeForMaterialization(CodeGen &codegen, len_type = codegen.Int32Type(); } +llvm::Function *Varbinary::GetInputFunction( + UNUSED_ATTRIBUTE CodeGen &codegen, + UNUSED_ATTRIBUTE const Type &type) const { + throw NotImplementedException{"Blob input not implemented yet"}; +} + llvm::Function *Varbinary::GetOutputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { // TODO: We should use the length information in the type? diff --git a/src/codegen/type/varchar_type.cpp b/src/codegen/type/varchar_type.cpp index 0066457e425..001b6afaca9 100644 --- a/src/codegen/type/varchar_type.cpp +++ b/src/codegen/type/varchar_type.cpp @@ -498,11 +498,8 @@ struct Substr : public TypeSystem::NaryOperator { // Setup function arguments llvm::Value *executor_ctx = ctx.executor_context; std::vector args = { - executor_ctx, - input_args[0].GetValue(), - input_args[0].GetLength(), - input_args[1].GetValue(), - input_args[2].GetValue(), + executor_ctx, input_args[0].GetValue(), input_args[0].GetLength(), + input_args[1].GetValue(), input_args[2].GetValue(), }; // Call @@ -550,9 +547,12 @@ LTrim kLTrim; RTrim kRTrim; Repeat kRepeat; std::vector kBinaryOperatorTable = { - {OperatorId::Like, kLike}, {OperatorId::DateTrunc, kDateTrunc}, - {OperatorId::DatePart, kDatePart}, {OperatorId::BTrim, kBTrim}, - {OperatorId::LTrim, kLTrim}, {OperatorId::RTrim, kRTrim}, + {OperatorId::Like, kLike}, + {OperatorId::DateTrunc, kDateTrunc}, + {OperatorId::DatePart, kDatePart}, + {OperatorId::BTrim, kBTrim}, + {OperatorId::LTrim, kLTrim}, + {OperatorId::RTrim, kRTrim}, {OperatorId::Repeat, kRepeat}}; // Nary operations @@ -596,6 +596,12 @@ void Varchar::GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, len_type = codegen.Int32Type(); } +llvm::Function *Varchar::GetInputFunction( + UNUSED_ATTRIBUTE CodeGen &codegen, + UNUSED_ATTRIBUTE const Type &type) const { + throw NotImplementedException{"String input not implemented yet"}; +} + llvm::Function *Varchar::GetOutputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { // TODO: We should use the length information in the type? diff --git a/src/codegen/values_runtime.cpp b/src/codegen/values_runtime.cpp index 33977174925..1e3324ade4d 100644 --- a/src/codegen/values_runtime.cpp +++ b/src/codegen/values_runtime.cpp @@ -24,6 +24,12 @@ namespace peloton { namespace codegen { +//////////////////////////////////////////////////////////////////////////////// +/// +/// Output functions +/// +//////////////////////////////////////////////////////////////////////////////// + namespace { inline void SetValue(peloton::type::Value *val_ptr, @@ -94,8 +100,22 @@ void ValuesRuntime::OutputVarbinary(char *values, uint32_t idx, const char *ptr, peloton::type::ValueFactory::GetVarbinaryValue(bin_ptr, len, false)); } +//////////////////////////////////////////////////////////////////////////////// +/// +/// Input functions +/// +//////////////////////////////////////////////////////////////////////////////// + namespace { +/** + * Skip all leading and trailing whitespace from the string bounded by the + * provided pointers. This function will modify the input pointers to point to + * the first non-space character at the start and end of the input string. + * + * @param[in,out] left A pointer to the leftmost character in the input string + * @param[in,out] right A pointer to the rightmost character in the input string + */ void TrimLeftRight(char *&left, char *&right) { while (*left == ' ') { left++; @@ -105,6 +125,17 @@ void TrimLeftRight(char *&left, char *&right) { } } +/** + * Convert the provided input string into a integral number. This function + * handles leading whitespace and leading negative (-) or positive (+) signs. + * Additionally, it performs a bounds check to ensure the number falls into the + * valid range of numbers for the given type. + * + * @tparam T The integral type (int8_t, int16_t, int32_t, int64_t) + * @param ptr A pointer to the start of the input string + * @param len The length of the input string + * @return The numeric interpretation of the input string + */ template typename std::enable_if::value, T>::type ToNum( char *ptr, uint32_t len) { @@ -113,7 +144,7 @@ typename std::enable_if::value, T>::type ToNum( // ERROR } - // Trim whitespace on left and right + // Trim leading and trailing whitespace TrimLeftRight(start, end); // Check negative or positive sign @@ -125,6 +156,7 @@ typename std::enable_if::value, T>::type ToNum( start++; } + // Convert int64_t num = 0; while (start != end) { if (*start < '0' || *start > '9') { @@ -136,15 +168,18 @@ typename std::enable_if::value, T>::type ToNum( start++; } + // Negate number if we need to if (negative) { num = -num; } + // Perform bounds check if (num <= std::numeric_limits::min() || num >= std::numeric_limits::max()) { RuntimeFunctions::ThrowOverflowException(); } + // Done return static_cast(num); } @@ -157,7 +192,7 @@ bool ValuesRuntime::InputBoolean(UNUSED_ATTRIBUTE const type::Type &type, char *start = ptr, *end = ptr + len; - // Trim whitespace on both ends + // Trim leading and trailing whitespace TrimLeftRight(start, end); // diff --git a/src/include/codegen/proxy/values_runtime_proxy.h b/src/include/codegen/proxy/values_runtime_proxy.h index 77f78979572..3fe57ab36fb 100644 --- a/src/include/codegen/proxy/values_runtime_proxy.h +++ b/src/include/codegen/proxy/values_runtime_proxy.h @@ -30,11 +30,11 @@ PROXY(ValuesRuntime) { DECLARE_METHOD(OutputVarchar); DECLARE_METHOD(OutputVarbinary); - DECLARE_MEMBER(InputBoolean); - DECLARE_MEMBER(InputTinyInt); - DECLARE_MEMBER(InputSmallInt); - DECLARE_MEMBER(InputInteger); - DECLARE_MEMBER(InputBigInt); + DECLARE_METHOD(InputBoolean); + DECLARE_METHOD(InputTinyInt); + DECLARE_METHOD(InputSmallInt); + DECLARE_METHOD(InputInteger); + DECLARE_METHOD(InputBigInt); DECLARE_METHOD(CompareStrings); }; diff --git a/src/include/codegen/type/array_type.h b/src/include/codegen/type/array_type.h index e3b0fe7cc6a..052e55ca4ca 100644 --- a/src/include/codegen/type/array_type.h +++ b/src/include/codegen/type/array_type.h @@ -33,6 +33,9 @@ class Array : public SqlType, public Singleton { void GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Type *&len_type) const override; + llvm::Function *GetInputFunction(CodeGen &codegen, + const Type &type) const override; + llvm::Function *GetOutputFunction(CodeGen &codegen, const Type &type) const override; diff --git a/src/include/codegen/type/bigint_type.h b/src/include/codegen/type/bigint_type.h index 043e71a3e91..9f2abfe7aea 100644 --- a/src/include/codegen/type/bigint_type.h +++ b/src/include/codegen/type/bigint_type.h @@ -33,6 +33,9 @@ class BigInt : public SqlType, public Singleton { void GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Type *&len_type) const override; + llvm::Function *GetInputFunction(CodeGen &codegen, + const Type &type) const override; + llvm::Function *GetOutputFunction(CodeGen &codegen, const Type &type) const override; diff --git a/src/include/codegen/type/boolean_type.h b/src/include/codegen/type/boolean_type.h index 3c070b18714..5e854ba800e 100644 --- a/src/include/codegen/type/boolean_type.h +++ b/src/include/codegen/type/boolean_type.h @@ -35,6 +35,9 @@ class Boolean : public SqlType, public Singleton { void GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Type *&len_type) const override; + llvm::Function *GetInputFunction(CodeGen &codegen, + const Type &type) const override; + llvm::Function *GetOutputFunction(CodeGen &codegen, const Type &type) const override; diff --git a/src/include/codegen/type/date_type.h b/src/include/codegen/type/date_type.h index 03cf5da7827..225420e59c8 100644 --- a/src/include/codegen/type/date_type.h +++ b/src/include/codegen/type/date_type.h @@ -33,6 +33,9 @@ class Date : public SqlType, public Singleton { void GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Type *&len_type) const override; + llvm::Function *GetInputFunction(CodeGen &codegen, + const Type &type) const override; + llvm::Function *GetOutputFunction(CodeGen &codegen, const Type &type) const override; diff --git a/src/include/codegen/type/decimal_type.h b/src/include/codegen/type/decimal_type.h index b180fc2b4eb..6260fb98aba 100644 --- a/src/include/codegen/type/decimal_type.h +++ b/src/include/codegen/type/decimal_type.h @@ -33,6 +33,9 @@ class Decimal : public SqlType, public Singleton { void GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Type *&len_type) const override; + llvm::Function *GetInputFunction(CodeGen &codegen, + const Type &type) const override; + llvm::Function *GetOutputFunction(CodeGen &codegen, const Type &type) const override; diff --git a/src/include/codegen/type/integer_type.h b/src/include/codegen/type/integer_type.h index b8f6d97ea4f..dbc2b30957e 100644 --- a/src/include/codegen/type/integer_type.h +++ b/src/include/codegen/type/integer_type.h @@ -33,6 +33,9 @@ class Integer : public SqlType, public Singleton { void GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Type *&len_type) const override; + llvm::Function *GetInputFunction(CodeGen &codegen, + const Type &type) const override; + llvm::Function *GetOutputFunction(CodeGen &codegen, const Type &type) const override; diff --git a/src/include/codegen/type/smallint_type.h b/src/include/codegen/type/smallint_type.h index 86f0e9a2cb3..9c1068a0a82 100644 --- a/src/include/codegen/type/smallint_type.h +++ b/src/include/codegen/type/smallint_type.h @@ -33,6 +33,9 @@ class SmallInt : public SqlType, public Singleton { void GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Type *&len_type) const override; + llvm::Function *GetInputFunction(CodeGen &codegen, + const Type &type) const override; + llvm::Function *GetOutputFunction(CodeGen &codegen, const Type &type) const override; diff --git a/src/include/codegen/type/sql_type.h b/src/include/codegen/type/sql_type.h index a9232339570..256073bf80e 100644 --- a/src/include/codegen/type/sql_type.h +++ b/src/include/codegen/type/sql_type.h @@ -56,6 +56,8 @@ class SqlType { virtual void GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Type *&len_type) const = 0; + virtual llvm::Function *GetInputFunction(CodeGen &codegen, + const Type &type) const = 0; virtual llvm::Function *GetOutputFunction(CodeGen &codegen, const Type &type) const = 0; virtual const TypeSystem &GetTypeSystem() const = 0; diff --git a/src/include/codegen/type/timestamp_type.h b/src/include/codegen/type/timestamp_type.h index b185cc349bf..febc95f1077 100644 --- a/src/include/codegen/type/timestamp_type.h +++ b/src/include/codegen/type/timestamp_type.h @@ -33,6 +33,9 @@ class Timestamp : public SqlType, public Singleton { void GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Type *&len_type) const override; + llvm::Function *GetInputFunction(CodeGen &codegen, + const Type &type) const override; + llvm::Function *GetOutputFunction(CodeGen &codegen, const Type &type) const override; diff --git a/src/include/codegen/type/tinyint_type.h b/src/include/codegen/type/tinyint_type.h index ae7cbd86b18..8593dd7b1de 100644 --- a/src/include/codegen/type/tinyint_type.h +++ b/src/include/codegen/type/tinyint_type.h @@ -33,6 +33,9 @@ class TinyInt : public SqlType, public Singleton { void GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Type *&len_type) const override; + llvm::Function *GetInputFunction(CodeGen &codegen, + const Type &type) const override; + llvm::Function *GetOutputFunction(CodeGen &codegen, const Type &type) const override; diff --git a/src/include/codegen/type/varbinary_type.h b/src/include/codegen/type/varbinary_type.h index 54974e0a613..b9ad9cd3cf0 100644 --- a/src/include/codegen/type/varbinary_type.h +++ b/src/include/codegen/type/varbinary_type.h @@ -33,6 +33,9 @@ class Varbinary : public SqlType, public Singleton { void GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Type *&len_type) const override; + llvm::Function *GetInputFunction(CodeGen &codegen, + const Type &type) const override; + llvm::Function *GetOutputFunction(CodeGen &codegen, const Type &type) const override; diff --git a/src/include/codegen/type/varchar_type.h b/src/include/codegen/type/varchar_type.h index 796d493772a..1664a8a10c7 100644 --- a/src/include/codegen/type/varchar_type.h +++ b/src/include/codegen/type/varchar_type.h @@ -33,6 +33,9 @@ class Varchar : public SqlType, public Singleton { void GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Type *&len_type) const override; + llvm::Function *GetInputFunction(CodeGen &codegen, + const Type &type) const override; + llvm::Function *GetOutputFunction(CodeGen &codegen, const Type &type) const override; From 3e3d689b5d70e2e571f453dbfaa1d2955e64cfd1 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Mon, 30 Apr 2018 10:36:01 -0400 Subject: [PATCH 12/42] Added test for value integrity --- src/codegen/values_runtime.cpp | 28 +++++----- src/include/codegen/values_runtime.h | 15 ++++-- test/codegen/value_integrity_test.cpp | 76 +++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 20 deletions(-) diff --git a/src/codegen/values_runtime.cpp b/src/codegen/values_runtime.cpp index 1e3324ade4d..dddc0a43ac6 100644 --- a/src/codegen/values_runtime.cpp +++ b/src/codegen/values_runtime.cpp @@ -116,12 +116,12 @@ namespace { * @param[in,out] left A pointer to the leftmost character in the input string * @param[in,out] right A pointer to the rightmost character in the input string */ -void TrimLeftRight(char *&left, char *&right) { +void TrimLeftRight(const char *&left, const char *&right) { while (*left == ' ') { left++; } - while (*right == ' ') { - right++; + while (right > left && *(right - 1) == ' ') { + right--; } } @@ -138,11 +138,9 @@ void TrimLeftRight(char *&left, char *&right) { */ template typename std::enable_if::value, T>::type ToNum( - char *ptr, uint32_t len) { - char *start = ptr, *end = ptr + len; - if (start == end) { - // ERROR - } + const char *ptr, uint32_t len) { + const char *start = ptr; + const char *end = start + len; // Trim leading and trailing whitespace TrimLeftRight(start, end); @@ -173,7 +171,7 @@ typename std::enable_if::value, T>::type ToNum( num = -num; } - // Perform bounds check + // Range check if (num <= std::numeric_limits::min() || num >= std::numeric_limits::max()) { RuntimeFunctions::ThrowOverflowException(); @@ -186,11 +184,11 @@ typename std::enable_if::value, T>::type ToNum( } // namespace bool ValuesRuntime::InputBoolean(UNUSED_ATTRIBUTE const type::Type &type, - char *ptr, uint32_t len) { + const char *ptr, uint32_t len) { PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); PELOTON_ASSERT(len != 0 && "Length must be non-zero"); - char *start = ptr, *end = ptr + len; + const char *start = ptr, *end = ptr + len; // Trim leading and trailing whitespace TrimLeftRight(start, end); @@ -267,28 +265,28 @@ bool ValuesRuntime::InputBoolean(UNUSED_ATTRIBUTE const type::Type &type, } int8_t ValuesRuntime::InputTinyInt(UNUSED_ATTRIBUTE const type::Type &type, - char *ptr, uint32_t len) { + const char *ptr, uint32_t len) { PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); PELOTON_ASSERT(len != 0 && "Length must be non-zero"); return ToNum(ptr, len); } int16_t ValuesRuntime::InputSmallInt(UNUSED_ATTRIBUTE const type::Type &type, - char *ptr, uint32_t len) { + const char *ptr, uint32_t len) { PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); PELOTON_ASSERT(len != 0 && "Length must be non-zero"); return ToNum(ptr, len); } int32_t ValuesRuntime::InputInteger(UNUSED_ATTRIBUTE const type::Type &type, - char *ptr, uint32_t len) { + const char *ptr, uint32_t len) { PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); PELOTON_ASSERT(len != 0 && "Length must be non-zero"); return ToNum(ptr, len); } int64_t ValuesRuntime::InputBigInt(UNUSED_ATTRIBUTE const type::Type &type, - char *ptr, uint32_t len) { + const char *ptr, uint32_t len) { PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); PELOTON_ASSERT(len != 0 && "Length must be non-zero"); return ToNum(ptr, len); diff --git a/src/include/codegen/values_runtime.h b/src/include/codegen/values_runtime.h index e37396e5aa8..206e9ed9bb2 100644 --- a/src/include/codegen/values_runtime.h +++ b/src/include/codegen/values_runtime.h @@ -67,15 +67,20 @@ class ValuesRuntime { //// ////////////////////////////////////////////////////////////////////////////// - static bool InputBoolean(const type::Type &type, char *ptr, uint32_t len); + static bool InputBoolean(const type::Type &type, const char *ptr, + uint32_t len); - static int8_t InputTinyInt(const type::Type &type, char *ptr, uint32_t len); + static int8_t InputTinyInt(const type::Type &type, const char *ptr, + uint32_t len); - static int16_t InputSmallInt(const type::Type &type, char *ptr, uint32_t len); + static int16_t InputSmallInt(const type::Type &type, const char *ptr, + uint32_t len); - static int32_t InputInteger(const type::Type &type, char *ptr, uint32_t len); + static int32_t InputInteger(const type::Type &type, const char *ptr, + uint32_t len); - static int64_t InputBigInt(const type::Type &type, char *ptr, uint32_t len); + static int64_t InputBigInt(const type::Type &type, const char *ptr, + uint32_t len); /** * Compare two strings, returning an integer value indicating their sort order diff --git a/test/codegen/value_integrity_test.cpp b/test/codegen/value_integrity_test.cpp index 551e3956e75..4c4ccf97690 100644 --- a/test/codegen/value_integrity_test.cpp +++ b/test/codegen/value_integrity_test.cpp @@ -17,6 +17,7 @@ #include "codegen/type/smallint_type.h" #include "codegen/type/integer_type.h" #include "codegen/type/bigint_type.h" +#include "codegen/values_runtime.h" namespace peloton { namespace test { @@ -161,5 +162,80 @@ TEST_F(ValueIntegrityTest, IntegerDivideByZero) { } } +namespace { + +template +using InputFunc = T (*)(const codegen::type::Type &, const char *, uint32_t); + +template +void TestInputIntegral( + const codegen::type::Type &type, InputFunc TestFunc, + std::vector> extra_valid_tests = {}, + std::vector extra_invalid_tests = {}, + std::vector extra_overflow_tests = {}) { + // Default valid tests - these are valid for all integral types + std::vector> valid_tests = {{"0", 0}, + {"-1", -1}, + {"2", 2}, + {"+3", 3}, + {" 4", 4}, + {" -5", -5}, + {" +6", 6}, + {"7 ", 7}, + {"-8 ", -8}, + {" 9 ", 9}, + {" -10 ", -10}, + {" +11 ", 11}}; + valid_tests.insert(valid_tests.end(), extra_valid_tests.begin(), + extra_valid_tests.end()); + + // Default invalid tests + std::vector invalid_tests = {"a", "-b", "+c", " 1c", + "2d ", "3 3", "-4 4"}; + invalid_tests.insert(invalid_tests.end(), extra_invalid_tests.begin(), + extra_invalid_tests.end()); + + // Default overflow tests + std::vector overflow_tests = { + std::to_string(static_cast(std::numeric_limits::min()) - 1), + std::to_string(static_cast(std::numeric_limits::max()) + 1)}; + overflow_tests.insert(overflow_tests.end(), extra_overflow_tests.begin(), + extra_overflow_tests.end()); + + for (const auto &test : valid_tests) { + auto *ptr = test.first.data(); + auto len = static_cast(test.first.length()); + EXPECT_EQ(test.second, TestFunc(type, ptr, len)); + } + + for (const auto &test : invalid_tests) { + auto *ptr = test.data(); + auto len = static_cast(test.length()); + EXPECT_THROW(TestFunc(type, ptr, len), std::runtime_error); + } + + for (const auto &test : overflow_tests) { + auto *ptr = test.data(); + auto len = static_cast(test.length()); + EXPECT_THROW(TestFunc(type, ptr, len), std::overflow_error); + } +} +} // namespace + +TEST_F(ValueIntegrityTest, InputIntegralTypesTest) { + codegen::type::Type tinyint{type::TypeId::TINYINT, false}; + TestInputIntegral(tinyint, codegen::ValuesRuntime::InputTinyInt, + {{"-126", -126}, {"126", 126}}); + + codegen::type::Type smallint{type::TypeId::SMALLINT, false}; + TestInputIntegral(smallint, codegen::ValuesRuntime::InputSmallInt); + + codegen::type::Type integer{type::TypeId::INTEGER, false}; + TestInputIntegral(integer, codegen::ValuesRuntime::InputInteger); + + codegen::type::Type bigint{type::TypeId::BIGINT, false}; + TestInputIntegral(bigint, codegen::ValuesRuntime::InputBigInt); +} + } // namespace test } // namespace peloton \ No newline at end of file From 473b9b423ef5b6110c45b9df32dc3b4419e6e04c Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Mon, 30 Apr 2018 22:11:34 -0400 Subject: [PATCH 13/42] First take at CSV Scan translator --- src/codegen/codegen.cpp | 21 ++- src/codegen/operator/csv_scan_translator.cpp | 171 ++++++++++++++++++ src/codegen/proxy/csv_scanner_proxy.cpp | 31 ++++ src/codegen/query_compiler.cpp | 3 +- src/codegen/translator_factory.cpp | 7 + src/codegen/type/type.cpp | 13 +- src/codegen/util/csv_scanner.cpp | 75 ++++++++ src/include/codegen/codegen.h | 8 +- .../codegen/operator/csv_scan_translator.h | 74 ++++++++ src/include/codegen/proxy/csv_scanner_proxy.h | 47 +++++ src/include/codegen/type/type.h | 16 ++ src/include/codegen/util/csv_scanner.h | 65 +++++++ src/include/planner/aggregate_plan.h | 2 +- src/include/planner/insert_plan.h | 3 - src/planner/aggregate_plan.cpp | 48 ++--- src/planner/insert_plan.cpp | 13 +- 16 files changed, 545 insertions(+), 52 deletions(-) create mode 100644 src/codegen/operator/csv_scan_translator.cpp create mode 100644 src/codegen/proxy/csv_scanner_proxy.cpp create mode 100644 src/codegen/util/csv_scanner.cpp create mode 100644 src/include/codegen/operator/csv_scan_translator.h create mode 100644 src/include/codegen/proxy/csv_scanner_proxy.h create mode 100644 src/include/codegen/util/csv_scanner.h diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index b6449ae4138..6a96a0f7542 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -60,7 +60,7 @@ llvm::Constant *CodeGen::ConstDouble(double val) const { } llvm::Value *CodeGen::ConstString(const std::string &str_val, - const std::string &name) const { + const std::string &name) const { // Strings are treated as arrays of bytes auto *str = llvm::ConstantDataArray::getString(GetContext(), str_val); auto *global_var = @@ -69,8 +69,18 @@ llvm::Value *CodeGen::ConstString(const std::string &str_val, return GetBuilder().CreateInBoundsGEP(global_var, {Const32(0), Const32(0)}); } -llvm::Value *CodeGen::ConstGenericBytes(llvm::Type *type, const void *data, - uint32_t length, +llvm::Value *CodeGen::ConstType(const type::Type &type) { + auto iter = type_variables_.find(type); + if (iter != type_variables_.end()) { + return iter->second; + } + const type::Type t = type; + llvm::Value *ret = ConstGenericBytes(&type, sizeof(type), "type"); + type_variables_.insert(std::make_pair(t, ret)); + return ret; +} + +llvm::Value *CodeGen::ConstGenericBytes(const void *data, uint32_t length, const std::string &name) const { // Create the constant data array that wraps the input data llvm::ArrayRef elements{reinterpret_cast(data), @@ -78,8 +88,9 @@ llvm::Value *CodeGen::ConstGenericBytes(llvm::Type *type, const void *data, auto *arr = llvm::ConstantDataArray::get(GetContext(), elements); // Create a global variable for the data - auto *global_var = new llvm::GlobalVariable( - GetModule(), type, true, llvm::GlobalValue::InternalLinkage, arr, name); + auto *global_var = + new llvm::GlobalVariable(GetModule(), arr->getType(), true, + llvm::GlobalValue::InternalLinkage, arr, name); // Return a pointer to the first element return GetBuilder().CreateInBoundsGEP(global_var, {Const32(0), Const32(0)}); diff --git a/src/codegen/operator/csv_scan_translator.cpp b/src/codegen/operator/csv_scan_translator.cpp new file mode 100644 index 00000000000..e38525ada35 --- /dev/null +++ b/src/codegen/operator/csv_scan_translator.cpp @@ -0,0 +1,171 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// csv_scan_translator.cpp +// +// Identification: src/codegen/operator/csv_scan_translator.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "codegen/operator/csv_scan_translator.h" + +#include "codegen/compilation_context.h" +#include "codegen/function_builder.h" +#include "codegen/lang/if.h" +#include "codegen/operator/projection_translator.h" +#include "codegen/pipeline.h" +#include "codegen/proxy/csv_scanner_proxy.h" +#include "codegen/proxy/runtime_functions_proxy.h" +#include "codegen/type/sql_type.h" +#include "planner/csv_scan_plan.h" + +namespace peloton { +namespace codegen { + +CSVScanTranslator::CSVScanTranslator(const planner::CSVScanPlan &scan, + CompilationContext &context, + Pipeline &pipeline) + : OperatorTranslator(context, pipeline), scan_(scan) { + auto &runtime_state = context.GetRuntimeState(); + scanner_id_ = runtime_state.RegisterState( + "csvScanner", CSVScannerProxy::GetType(GetCodeGen())); +} + +void CSVScanTranslator::InitializeState() { + auto &codegen = GetCodeGen(); + + // Arguments + auto *scanner_ptr = LoadStatePtr(scanner_id_); + auto *file_path = codegen.ConstString(scan_.GetFileName(), "filePath"); + auto *output_col_types = ConstructColumnDescriptor(); + auto *runtime_state_ptr = codegen->CreatePointerCast( + codegen.GetState(), codegen.VoidType()->getPointerTo()); + + std::vector out_cols; + scan_.GetOutputColumns(out_cols); + auto *num_output_cols = + codegen.Const32(static_cast(out_cols.size())); + + auto *consumer_func = codegen->CreatePointerCast( + consumer_func_, proxy::TypeBuilder::GetType(codegen)); + + // Call + codegen.Call(CSVScannerProxy::Init, + {scanner_ptr, file_path, output_col_types, num_output_cols, + consumer_func, runtime_state_ptr}); +} + +void CSVScanTranslator::DefineAuxiliaryFunctions() { + // Define consumer function here + CodeGen &codegen = GetCodeGen(); + CompilationContext &cc = GetCompilationContext(); + + std::vector arg_types = { + {"runtimeState", + cc.GetRuntimeState().FinalizeType(codegen)->getPointerTo()}}; + codegen::FunctionDeclaration decl{codegen.GetCodeContext(), "consumer", + FunctionDeclaration::Visibility::Internal, + codegen.VoidType(), arg_types}; + codegen::FunctionBuilder scan_consumer{codegen.GetCodeContext(), decl}; + { + ConsumerContext ctx{cc, GetPipeline()}; + + Vector v{nullptr, 1, nullptr}; + RowBatch one{GetCompilationContext(), codegen.Const32(0), + codegen.Const32(1), v, false}; + RowBatch::Row row{one, nullptr, nullptr}; + + // Get the attributes + std::vector output_attributes; + scan_.GetAttributes(output_attributes); + + // Load the pointer to the columns view + auto *cols = codegen->CreateLoad(codegen->CreateConstInBoundsGEP2_32( + CSVScannerProxy::GetType(codegen), LoadStatePtr(scanner_id_), 0, 4)); + + // For each column, call the type's input function to read the input value + for (uint32_t i = 0; i < output_attributes.size(); i++) { + const auto *output_ai = output_attributes[i]; + + const auto &sql_type = output_ai->type.GetSqlType(); + + auto *is_null = codegen->CreateConstInBoundsGEP2_32( + CSVScannerColumnProxy::GetType(codegen), cols, i, 3); + + codegen::Value val, null_val; + lang::If not_null{codegen, + codegen->CreateNot(codegen->CreateLoad(is_null))}; + { + // Grab a pointer to the ptr and length + auto *type = codegen->CreatePointerCast( + codegen.ConstType(output_ai->type), + TypeProxy::GetType(codegen)->getPointerTo()); + auto *ptr = codegen->CreateLoad(codegen->CreateConstInBoundsGEP2_32( + CSVScannerColumnProxy::GetType(codegen), cols, i, 1)); + auto *len = codegen->CreateLoad(codegen->CreateConstInBoundsGEP2_32( + CSVScannerColumnProxy::GetType(codegen), cols, i, 2)); + + // Invoke the input function + auto *input_func = sql_type.GetInputFunction(codegen, output_ai->type); + auto *raw_val = codegen.CallFunc(input_func, {type, ptr, len}); + + // Non-null value + val = codegen::Value{output_ai->type, raw_val, nullptr, + codegen.ConstBool(false)}; + } + not_null.ElseBlock(); + { + // Null value + null_val = sql_type.GetNullValue(codegen); + } + not_null.EndIf(); + + codegen::Value final_val = not_null.BuildPHI(val, null_val); + row.RegisterAttributeValue(output_ai, final_val); + } + + ctx.Consume(row); + scan_consumer.ReturnAndFinish(); + } + consumer_func_ = scan_consumer.GetFunction(); +} + +void CSVScanTranslator::Produce() const { + auto *scanner_ptr = LoadStatePtr(scanner_id_); + GetCodeGen().Call(CSVScannerProxy::Produce, {scanner_ptr}); +} + +void CSVScanTranslator::TearDownState() { + auto *scanner_ptr = LoadStatePtr(scanner_id_); + GetCodeGen().Call(CSVScannerProxy::Destroy, {scanner_ptr}); +} + +std::string CSVScanTranslator::GetName() const { + return std::__cxx11::string(); +} + +llvm::Value *CSVScanTranslator::ConstructColumnDescriptor() const { + // First, we pull out all the attributes produced by the scan, in order + std::vector cols; + scan_.GetAttributes(cols); + + // But, what we really need are just the column types, so pull those out now + std::vector col_types_vec; + for (const auto *col : cols) { + col_types_vec.push_back(col->type); + } + + CodeGen &codegen = GetCodeGen(); + + auto num_bytes = cols.size() * sizeof(decltype(col_types_vec)::value_type); + auto *bytes = codegen.ConstGenericBytes( + col_types_vec.data(), static_cast(num_bytes), "colTypes"); + return codegen->CreatePointerCast( + bytes, TypeProxy::GetType(codegen)->getPointerTo()); +} + +} // namespace codegen +} // namespace peloton \ No newline at end of file diff --git a/src/codegen/proxy/csv_scanner_proxy.cpp b/src/codegen/proxy/csv_scanner_proxy.cpp new file mode 100644 index 00000000000..89b4b7dca16 --- /dev/null +++ b/src/codegen/proxy/csv_scanner_proxy.cpp @@ -0,0 +1,31 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// csv_scanner_proxy.cpp +// +// Identification: src/codegen/proxy/csv_scanner_proxy.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "codegen/proxy/csv_scanner_proxy.h" + +#include "codegen/proxy/runtime_functions_proxy.h" + +namespace peloton { +namespace codegen { + +DEFINE_TYPE(CSVScanner, "util::CSVScanner", MEMBER(file_path), MEMBER(callback), + MEMBER(opaque_callback_state), MEMBER(cols), MEMBER(cols_view)); + +DEFINE_TYPE(CSVScannerColumn, "util::CSVScanner::Column", MEMBER(type), + MEMBER(ptr), MEMBER(len), MEMBER(is_null)); + +DEFINE_METHOD(peloton::codegen::util, CSVScanner, Init); +DEFINE_METHOD(peloton::codegen::util, CSVScanner, Destroy); +DEFINE_METHOD(peloton::codegen::util, CSVScanner, Produce); + +} // namespace codegen +} // namespace peloton \ No newline at end of file diff --git a/src/codegen/query_compiler.cpp b/src/codegen/query_compiler.cpp index 104e4f5783a..d6aa9912d51 100644 --- a/src/codegen/query_compiler.cpp +++ b/src/codegen/query_compiler.cpp @@ -6,7 +6,7 @@ // // Identification: src/codegen/query_compiler.cpp // -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -46,6 +46,7 @@ std::unique_ptr QueryCompiler::Compile( bool QueryCompiler::IsSupported(const planner::AbstractPlan &plan) { switch (plan.GetPlanNodeType()) { case PlanNodeType::SEQSCAN: + case PlanNodeType::CSVSCAN: case PlanNodeType::ORDERBY: case PlanNodeType::DELETE: case PlanNodeType::INSERT: diff --git a/src/codegen/translator_factory.cpp b/src/codegen/translator_factory.cpp index f10fd863033..15b9dab7e7a 100644 --- a/src/codegen/translator_factory.cpp +++ b/src/codegen/translator_factory.cpp @@ -23,6 +23,7 @@ #include "codegen/expression/parameter_translator.h" #include "codegen/expression/tuple_value_translator.h" #include "codegen/operator/block_nested_loop_join_translator.h" +#include "codegen/operator/csv_scan_translator.h" #include "codegen/operator/delete_translator.h" #include "codegen/operator/global_group_by_translator.h" #include "codegen/operator/hash_group_by_translator.h" @@ -42,6 +43,7 @@ #include "expression/operator_expression.h" #include "expression/tuple_value_expression.h" #include "planner/aggregate_plan.h" +#include "planner/csv_scan_plan.h" #include "planner/delete_plan.h" #include "planner/hash_join_plan.h" #include "planner/hash_plan.h" @@ -68,6 +70,11 @@ std::unique_ptr TranslatorFactory::CreateTranslator( translator = new TableScanTranslator(scan, context, pipeline); break; } + case PlanNodeType::CSVSCAN: { + auto &scan = static_cast(plan_node); + translator = new CSVScanTranslator(scan, context, pipeline); + break; + } case PlanNodeType::PROJECTION: { auto &projection = static_cast(plan_node); diff --git a/src/codegen/type/type.cpp b/src/codegen/type/type.cpp index ed8425302ee..9b4e8e7cf1b 100644 --- a/src/codegen/type/type.cpp +++ b/src/codegen/type/type.cpp @@ -6,7 +6,7 @@ // // Identification: src/codegen/type/type.cpp // -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -18,15 +18,20 @@ namespace peloton { namespace codegen { namespace type { -Type::Type() : Type(peloton::type::TypeId::INVALID, false) {} - Type::Type(peloton::type::TypeId type_id, bool _nullable) - : type_id(type_id), nullable(_nullable) {} + : type_id(type_id), nullable(_nullable) { + aux_info.varlen = 0; + aux_info.numeric_info.precision = 0; + aux_info.numeric_info.scale = 0; +} + +Type::Type() : Type(peloton::type::TypeId::INVALID, false) {} Type::Type(const SqlType &sql_type, bool _nullable) : Type(sql_type.TypeId(), _nullable) {} bool Type::operator==(const Type &other) const { + // TODO(pmenon): This isn't correct; we need to check all other fields ... return type_id == other.type_id; } diff --git a/src/codegen/util/csv_scanner.cpp b/src/codegen/util/csv_scanner.cpp new file mode 100644 index 00000000000..544269c82bd --- /dev/null +++ b/src/codegen/util/csv_scanner.cpp @@ -0,0 +1,75 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// csv_scanner.cpp +// +// Identification: src/codegen/util/csv_scanner.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "codegen/util/csv_scanner.h" + +#include + +#include "common/exception.h" +#include "util/string_util.h" + +namespace peloton { +namespace codegen { +namespace util { + +CSVScanner::CSVScanner(const std::string &file_path, + const codegen::type::Type *col_types, uint32_t num_cols, + CSVScanner::Callback func, void *opaque_state) + : file_path_(file_path), func_(func), opaque_state_(opaque_state) { + // Initialize the columns + cols_.resize(num_cols); + for (uint32_t i = 0; i < num_cols; i++) { + cols_[i].col_type = col_types[i]; + cols_[i].ptr = nullptr; + cols_[i].is_null = false; + } + + // Setup the view. Since the Column's vector will never be resized after this + // point (it isn't possible to add or remove columns once the scan has been + // constructed), grabbing a pointer to the underlying array is safe for the + // lifetime of this scanner. + cols_view_ = cols_.data(); +} + +CSVScanner::~CSVScanner() {} + +void CSVScanner::Init(CSVScanner &scanner, const char *file_path, + const codegen::type::Type *col_types, uint32_t num_cols, + CSVScanner::Callback func, void *opaque_state) { + new (&scanner) CSVScanner(file_path, col_types, num_cols, func, opaque_state); +} + +void CSVScanner::Destroy(CSVScanner &scanner) { scanner.~CSVScanner(); } + +void CSVScanner::Produce() { InitializeScan(); } + +void CSVScanner::InitializeScan() { + // Validity checks + if (!boost::filesystem::exists(file_path_)) { + throw ExecutorException{StringUtil::Format( + "ERROR: input path '%s' does not exist", file_path_.c_str())}; + } + + if (!boost::filesystem::is_directory(file_path_)) { + throw ExecutorException{StringUtil::Format( + "ERROR: input '%s' is a directory, not a file", file_path_.c_str())}; + } + + if (!boost::filesystem::is_regular_file(file_path_)) { + throw ExecutorException{StringUtil::Format( + "ERROR: unable to read file '%s'", file_path_.c_str())}; + } +} + +} // namespace util +} // namespace codegen +} // namespace peloton \ No newline at end of file diff --git a/src/include/codegen/codegen.h b/src/include/codegen/codegen.h index 09edae81900..3dceb820715 100644 --- a/src/include/codegen/codegen.h +++ b/src/include/codegen/codegen.h @@ -16,6 +16,7 @@ #include #include "codegen/code_context.h" +#include "codegen/type/type.h" namespace peloton { namespace codegen { @@ -97,8 +98,8 @@ class CodeGen { llvm::Constant *ConstDouble(double val) const; llvm::Value *ConstString(const std::string &str_val, const std::string &name) const; - llvm::Value *ConstGenericBytes(llvm::Type *type, const void *data, - uint32_t length, + llvm::Value *ConstType(const type::Type &type); + llvm::Value *ConstGenericBytes(const void *data, uint32_t length, const std::string &name) const; llvm::Constant *Null(llvm::Type *type) const; llvm::Constant *NullPtr(llvm::PointerType *type) const; @@ -192,6 +193,9 @@ class CodeGen { private: // The context/module where all the code this class produces goes CodeContext &code_context_; + + std::unordered_map type_variables_; }; } // namespace codegen diff --git a/src/include/codegen/operator/csv_scan_translator.h b/src/include/codegen/operator/csv_scan_translator.h new file mode 100644 index 00000000000..12e132ab4ce --- /dev/null +++ b/src/include/codegen/operator/csv_scan_translator.h @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// csv_scan_translator.h +// +// Identification: src/include/codegen/operator/csv_scan_translator.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "codegen/operator/operator_translator.h" + +namespace peloton { + +namespace planner { +class CSVScanPlan; +} // namespace planner + +namespace codegen { +class CompilationContext; +class Pipeline; +} // namespace codegen + +namespace codegen { + +//===----------------------------------------------------------------------===// +// A translator for CSV file scans +//===----------------------------------------------------------------------===// +class CSVScanTranslator : public OperatorTranslator { + public: + // Constructor + CSVScanTranslator(const planner::CSVScanPlan &scan, + CompilationContext &context, Pipeline &pipeline); + + void InitializeState() override; + + void DefineAuxiliaryFunctions() override; + + // The method that produces new tuples + void Produce() const override; + + // Scans are leaves in the query plan and, hence, do not consume tuples + void Consume(ConsumerContext &, RowBatch &) const override {} + void Consume(ConsumerContext &, RowBatch::Row &) const override {} + + // Similar to InitializeState(), file scans don't have any state + void TearDownState() override; + + // Get a stringified version of this translator + std::string GetName() const override; + + private: + // Plan accessor + const planner::CSVScanPlan &GetScanPlan() const { return scan_; } + + llvm::Value *ConstructColumnDescriptor() const; + + private: + // The scan + const planner::CSVScanPlan &scan_; + + // The scanner state ID + RuntimeState::StateID scanner_id_; + + // The generated CSV scan consumer function + llvm::Function *consumer_func_; +}; + +} // namespace codegen +} // namespace peloton \ No newline at end of file diff --git a/src/include/codegen/proxy/csv_scanner_proxy.h b/src/include/codegen/proxy/csv_scanner_proxy.h new file mode 100644 index 00000000000..c31d871ff74 --- /dev/null +++ b/src/include/codegen/proxy/csv_scanner_proxy.h @@ -0,0 +1,47 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// csv_scanner_proxy.h +// +// Identification: src/include/codegen/proxy/csv_scanner_proxy.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "codegen/proxy/proxy.h" +#include "codegen/proxy/type_builder.h" +#include "codegen/util/csv_scanner.h" + +namespace peloton { +namespace codegen { + +PROXY(CSVScannerColumn) { + DECLARE_MEMBER(0, char[sizeof(type::Type)], type); + DECLARE_MEMBER(1, char *, ptr); + DECLARE_MEMBER(2, uint32_t, len); + DECLARE_MEMBER(3, bool, is_null); + DECLARE_TYPE; +}; + +PROXY(CSVScanner) { + DECLARE_MEMBER(0, char[sizeof(std::string)], file_path); + DECLARE_MEMBER(1, char[sizeof(util::CSVScanner::Callback)], callback); + DECLARE_MEMBER(2, void *, opaque_callback_state); + DECLARE_MEMBER(3, char[sizeof(std::vector)], cols); + DECLARE_MEMBER(4, util::CSVScanner::Column *, cols_view); + DECLARE_TYPE; + + DECLARE_METHOD(Init); + DECLARE_METHOD(Destroy); + DECLARE_METHOD(Produce); +}; + +TYPE_BUILDER(CSVScanner, codegen::util::CSVScanner); +TYPE_BUILDER(CSVScannerColumn, codegen::util::CSVScanner::Column); + +} // namespace codegen +} // namespace peloton \ No newline at end of file diff --git a/src/include/codegen/type/type.h b/src/include/codegen/type/type.h index d636d7d6572..1f485ad4e0e 100644 --- a/src/include/codegen/type/type.h +++ b/src/include/codegen/type/type.h @@ -15,6 +15,7 @@ #include #include "type/type_id.h" +#include "util/hash_util.h" namespace peloton { namespace codegen { @@ -78,6 +79,21 @@ class Type { Type AsNonNullable() const; }; +struct TypeHasher { + std::size_t operator()(const type::Type &type) const { + // TODO: hash the other parts + auto hash = HashUtil::Hash(&type.type_id); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&type.nullable)); + return hash; + } +}; + +struct TypeEquality { + bool operator()(const type::Type &l, const type::Type &r) const { + return l == r; + } +}; + } // namespace type } // namespace codegen } // namespace peloton diff --git a/src/include/codegen/util/csv_scanner.h b/src/include/codegen/util/csv_scanner.h new file mode 100644 index 00000000000..711a3d13535 --- /dev/null +++ b/src/include/codegen/util/csv_scanner.h @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// csv_scanner.h +// +// Identification: src/include/codegen/util/csv_scanner.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include "codegen/type/type.h" + +namespace peloton { +namespace codegen { +namespace util { + +class CSVScanner { + public: + using Callback = void (*)(void *); + + struct Column { + codegen::type::Type col_type; + char *ptr; + uint32_t len; + bool is_null; + }; + + CSVScanner(const std::string &file_path, const codegen::type::Type *col_types, + uint32_t num_cols, Callback func, void *opaque_state); + + ~CSVScanner(); + + static void Init(CSVScanner &scanner, const char *file_path, + const codegen::type::Type *col_types, uint32_t num_cols, + Callback func, void *opaque_state); + + static void Destroy(CSVScanner &scanner); + + void Produce(); + + private: + void InitializeScan(); + + private: + // The file + const std::string file_path_; + + // The callback function and opaque state + Callback func_; + void *opaque_state_; + + std::vector cols_; + Column *cols_view_; +}; + +} // namespace util +} // namespace codegen +} // namespace peloton \ No newline at end of file diff --git a/src/include/planner/aggregate_plan.h b/src/include/planner/aggregate_plan.h index 56c0e99a6b6..51d9d8cfe42 100644 --- a/src/include/planner/aggregate_plan.h +++ b/src/include/planner/aggregate_plan.h @@ -41,7 +41,7 @@ class AggregatePlan : public AbstractPlan { bool distinct = false); // Bindings - void PerformBinding(BindingContext &binding_context); + void PerformBinding(bool is_global, BindingContext &binding_context); AggTerm Copy() const; }; diff --git a/src/include/planner/insert_plan.h b/src/include/planner/insert_plan.h index 54072e76b3d..7c2bc212e55 100644 --- a/src/include/planner/insert_plan.h +++ b/src/include/planner/insert_plan.h @@ -70,9 +70,6 @@ class InsertPlan : public AbstractPlan { std::vector>> * insert_values); - // Get a varlen pool - will construct the pool only if needed - type::AbstractPool *GetPlanPool(); - PlanNodeType GetPlanNodeType() const override { return PlanNodeType::INSERT; }; diff --git a/src/planner/aggregate_plan.cpp b/src/planner/aggregate_plan.cpp index 8aad13b3edf..26f3a7e9d19 100644 --- a/src/planner/aggregate_plan.cpp +++ b/src/planner/aggregate_plan.cpp @@ -24,7 +24,8 @@ AggregatePlan::AggTerm::AggTerm(ExpressionType et, bool distinct) : aggtype(et), expression(expr), distinct(distinct) {} -void AggregatePlan::AggTerm::PerformBinding(BindingContext &binding_context) { +void AggregatePlan::AggTerm::PerformBinding(bool is_global, + BindingContext &binding_context) { // If there's an input expression, first perform binding auto *agg_expr = const_cast(expression); if (agg_expr != nullptr) { @@ -47,7 +48,7 @@ void AggregatePlan::AggTerm::PerformBinding(BindingContext &binding_context) { // TODO: Move this logic into the SQL type const auto &input_type = expression->ResultType(); agg_ai.type = codegen::type::Type{codegen::type::Decimal::Instance(), - input_type.nullable}; + input_type.nullable || is_global}; break; } case ExpressionType::AGGREGATE_MAX: @@ -57,6 +58,9 @@ void AggregatePlan::AggTerm::PerformBinding(BindingContext &binding_context) { // return type as its input expression. PELOTON_ASSERT(expression != nullptr); agg_ai.type = expression->ResultType(); + if (is_global) { + agg_ai.type = agg_ai.type.AsNullable(); + } break; } default: { @@ -93,7 +97,7 @@ void AggregatePlan::PerformBinding(BindingContext &binding_context) { // Now let the aggregate expressions do their bindings for (const auto &agg_term : GetUniqueAggTerms()) { auto &non_const_agg_term = const_cast(agg_term); - non_const_agg_term.PerformBinding(input_context); + non_const_agg_term.PerformBinding(IsGlobal(), input_context); } // Handle the projection by creating two binding contexts, the first being @@ -117,8 +121,6 @@ void AggregatePlan::PerformBinding(BindingContext &binding_context) { const_cast(predicate) ->PerformBinding({&binding_context}); } - - } hash_t AggregatePlan::Hash( @@ -165,27 +167,22 @@ hash_t AggregatePlan::Hash() const { bool AggregatePlan::AreEqual( const std::vector &A, const std::vector &B) const { - if (A.size() != B.size()) - return false; + if (A.size() != B.size()) return false; for (size_t i = 0; i < A.size(); i++) { - if (A[i].aggtype != B[i].aggtype) - return false; + if (A[i].aggtype != B[i].aggtype) return false; auto *expr = A[i].expression; - if (expr && (*expr != *B[i].expression)) - return false; + if (expr && (*expr != *B[i].expression)) return false; - if (A[i].distinct != B[i].distinct) - return false; + if (A[i].distinct != B[i].distinct) return false; } return true; } bool AggregatePlan::operator==(const AbstractPlan &rhs) const { - if (GetPlanNodeType() != rhs.GetPlanNodeType()) - return false; + if (GetPlanNodeType() != rhs.GetPlanNodeType()) return false; auto &other = static_cast(rhs); @@ -195,12 +192,10 @@ bool AggregatePlan::operator==(const AbstractPlan &rhs) const { if ((pred == nullptr && other_pred != nullptr) || (pred != nullptr && other_pred == nullptr)) return false; - if (pred && *pred != *other_pred) - return false; + if (pred && *pred != *other_pred) return false; // UniqueAggTerms - if (!AreEqual(GetUniqueAggTerms(), other.GetUniqueAggTerms())) - return false; + if (!AreEqual(GetUniqueAggTerms(), other.GetUniqueAggTerms())) return false; // Project Info auto *proj_info = GetProjectInfo(); @@ -208,24 +203,19 @@ bool AggregatePlan::operator==(const AbstractPlan &rhs) const { if ((proj_info == nullptr && other_proj_info != nullptr) || (proj_info != nullptr && other_proj_info == nullptr)) return false; - if (proj_info && *proj_info != *other_proj_info) - return false; + if (proj_info && *proj_info != *other_proj_info) return false; // Group by size_t group_by_col_ids_count = GetGroupbyColIds().size(); - if (group_by_col_ids_count != other.GetGroupbyColIds().size()) - return false; + if (group_by_col_ids_count != other.GetGroupbyColIds().size()) return false; for (size_t i = 0; i < group_by_col_ids_count; i++) { - if (GetGroupbyColIds()[i] != other.GetGroupbyColIds()[i]) - return false; + if (GetGroupbyColIds()[i] != other.GetGroupbyColIds()[i]) return false; } - if (*GetOutputSchema() != *other.GetOutputSchema()) - return false; + if (*GetOutputSchema() != *other.GetOutputSchema()) return false; - if (GetAggregateStrategy() != other.GetAggregateStrategy()) - return false; + if (GetAggregateStrategy() != other.GetAggregateStrategy()) return false; return (AbstractPlan::operator==(rhs)); } diff --git a/src/planner/insert_plan.cpp b/src/planner/insert_plan.cpp index ff0965c8b6a..c8f0a8cc40a 100644 --- a/src/planner/insert_plan.cpp +++ b/src/planner/insert_plan.cpp @@ -205,11 +205,6 @@ void InsertPlan::SetDefaultValue(uint32_t idx) { values_.push_back(*v); } -type::AbstractPool *InsertPlan::GetPlanPool() { - if (pool_.get() == nullptr) pool_.reset(new type::EphemeralPool()); - return pool_.get(); -} - void InsertPlan::SetParameterValues(std::vector *values) { LOG_TRACE("Set Parameter Values in Insert"); auto *schema = target_table_->GetSchema(); @@ -236,15 +231,19 @@ void InsertPlan::PerformBinding(BindingContext &binding_context) { const auto &children = GetChildren(); if (children.size() == 1) { + // Let child bind children[0]->PerformBinding(binding_context); + // Pull out what we need auto *scan = static_cast(children[0].get()); - auto &col_ids = scan->GetColumnIds(); + + std::vector col_ids; + scan->GetOutputColumns(col_ids); + for (oid_t col_id = 0; col_id < col_ids.size(); col_id++) { ais_.push_back(binding_context.Find(col_id)); } } - // Binding is not required if there is no child } hash_t InsertPlan::Hash() const { From 852fd42c44cd5d4457fc10e6bacf4eaf7a54b8e0 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Tue, 1 May 2018 16:48:52 -0400 Subject: [PATCH 14/42] Fix after rebase --- src/catalog/abstract_catalog.cpp | 1 + src/optimizer/query_to_operator_transformer.cpp | 7 ++++++- test/optimizer/optimizer_test.cpp | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/catalog/abstract_catalog.cpp b/src/catalog/abstract_catalog.cpp index 53c0b938279..9d9934a7c61 100644 --- a/src/catalog/abstract_catalog.cpp +++ b/src/catalog/abstract_catalog.cpp @@ -35,6 +35,7 @@ #include "executor/plan_executor.h" #include "executor/seq_scan_executor.h" #include "executor/update_executor.h" +#include "expression/constant_value_expression.h" #include "storage/database.h" #include "storage/storage_manager.h" diff --git a/src/optimizer/query_to_operator_transformer.cpp b/src/optimizer/query_to_operator_transformer.cpp index f5f05d6c6aa..816ef24a7fb 100644 --- a/src/optimizer/query_to_operator_transformer.cpp +++ b/src/optimizer/query_to_operator_transformer.cpp @@ -361,6 +361,10 @@ void QueryToOperatorTransformer::Visit(parser::UpdateStatement *op) { } void QueryToOperatorTransformer::Visit(parser::CopyStatement *op) { if (op->is_from) { + // The copy statement is reading from a file into a table. We construct a + // logical external-file get operator as the leaf, and an insert operator + // as the root. + auto get_op = std::make_shared(LogicalExternalFileGet::make( GetAndIncreaseGetId(), op->format, op->file_path)); @@ -368,7 +372,8 @@ void QueryToOperatorTransformer::Visit(parser::CopyStatement *op) { auto target_table = catalog::Catalog::GetInstance() ->GetDatabaseObject(op->table->GetDatabaseName(), txn_) - ->GetTableObject(op->table->GetTableName()); + ->GetTableObject(op->table->GetTableName(), + op->table->GetSchemaName()); auto insert_expr = std::make_shared( LogicalInsertSelect::make(target_table)); diff --git a/test/optimizer/optimizer_test.cpp b/test/optimizer/optimizer_test.cpp index 8b5ed1e0ec7..50696017bb5 100644 --- a/test/optimizer/optimizer_test.cpp +++ b/test/optimizer/optimizer_test.cpp @@ -20,6 +20,7 @@ #include "executor/create_executor.h" #include "executor/insert_executor.h" #include "executor/plan_executor.h" +#include "expression/constant_value_expression.h" #include "expression/tuple_value_expression.h" #include "optimizer/mock_task.h" #include "optimizer/operators.h" From b4906df32984864e55c92d830143a7c8840be326 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Mon, 7 May 2018 23:57:10 -0400 Subject: [PATCH 15/42] file api --- src/include/util/file.h | 74 ++++++++++++++++++++++++ src/util/file.cpp | 125 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 199 insertions(+) create mode 100644 src/include/util/file.h create mode 100644 src/util/file.cpp diff --git a/src/include/util/file.h b/src/include/util/file.h new file mode 100644 index 00000000000..6bf35850674 --- /dev/null +++ b/src/include/util/file.h @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// file.h +// +// Identification: src/include/util/file.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include "common/exception.h" + +namespace peloton { +namespace util { + +class File { + public: + enum class AccessMode : uint8_t { ReadOnly, WriteOnly, ReadWrite }; + + File() : fd_(kInvalid) {} + + ~File() { Close(); } + + // Move + File(File &&other) noexcept : fd_(kInvalid) { std::swap(fd_, other.fd_); } + + // Move + File &operator=(File &&other) noexcept { + // First, close this file + Close(); + + // Swap descriptors + std::swap(fd_, other.fd_); + + // Done + return *this; + } + + void Open(const std::string &name, AccessMode access_mode); + + void Create(const std::string &name); + + void CreateTemp(); + + uint64_t Read(void *data, uint64_t len) const; + + uint64_t Write(void *data, uint64_t len) const; + + uint64_t Size() const; + + bool IsOpen() const { return fd_ != kInvalid; } + + void Close(); + + private: + // The file descriptor + int fd_; + + static constexpr int kInvalid = -1; + + private: + DISALLOW_COPY(File); +}; + +} // namespace util +} // namespace peloton \ No newline at end of file diff --git a/src/util/file.cpp b/src/util/file.cpp new file mode 100644 index 00000000000..de0835982c8 --- /dev/null +++ b/src/util/file.cpp @@ -0,0 +1,125 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// file.cpp +// +// Identification: src/util/file.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "util/file.h" + +#include "util/string_util.h" + +namespace peloton { +namespace util { + +void File::Open(const std::string &name, File::AccessMode access_mode) { + // Close the existing file if it's open + Close(); + + int flags; + switch (access_mode) { + case AccessMode::ReadOnly: { + flags = O_RDWR; + break; + } + case AccessMode::WriteOnly: { + flags = O_WRONLY; + break; + } + case AccessMode::ReadWrite: { + flags = O_RDWR; + break; + } + } + + // Open + int fd = open(name.c_str(), flags); + + // Check error + if (fd == -1) { + throw Exception{ + StringUtil::Format("Unable to read file '%s'", name.c_str())}; + } + + // Done + fd_ = fd; +} + +uint64_t File::Read(void *data, uint64_t len) const { + // Ensure open + PELOTON_ASSERT(IsOpen()); + + // Perform read + ssize_t bytes_read = read(fd_, data, len); + + // Check error + if (bytes_read == -1) { + throw Exception{ + StringUtil::Format("Error reading file: %s", strerror(errno))}; + } + + // Done + return static_cast(bytes_read); +} + +uint64_t File::Write(void *data, uint64_t len) const { + // Ensure open + PELOTON_ASSERT(IsOpen()); + + // Perform write + ssize_t bytes_written = write(fd_, data, len); + + // Check error + if (bytes_written == -1) { + throw Exception{ + StringUtil::Format("Error writing to file: %s", strerror(errno))}; + } + + // Done + return static_cast(bytes_written); +} + +uint64_t File::Size() const { + // Ensure open + PELOTON_ASSERT(IsOpen()); + + // Save the current position + off_t curr_off = lseek(fd_, 0, SEEK_CUR); + if (curr_off == -1) { + throw Exception{StringUtil::Format( + "unable to read current position in file: %s", strerror(errno))}; + } + + // Seek to the end of the file, returning the new file position i.e., the + // size of the file in bytes. + off_t off = lseek(fd_, 0, SEEK_END); + if (off == -1) { + throw Exception{StringUtil::Format( + "unable to move file position to end file: %s", strerror(errno))}; + } + + off_t restore = lseek(fd_, curr_off, SEEK_SET); + if (restore == -1) { + throw Exception{StringUtil::Format( + "unable to restore position after moving to the end: %s", + strerror(errno))}; + } + + // Restore position + return static_cast(off); +} + +void File::Close() { + if (IsOpen()) { + close(fd_); + fd_ = kInvalid; + } +} + +} // namespace util +} // namespace peloton \ No newline at end of file From b41b863a67f83823d4cff272f2dc4215ec931103 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Tue, 8 May 2018 01:04:25 -0400 Subject: [PATCH 16/42] CSV scanner reads lines --- src/codegen/operator/csv_scan_translator.cpp | 10 +- src/codegen/proxy/csv_scanner_proxy.cpp | 7 +- src/codegen/util/csv_scanner.cpp | 226 +++++++++++++++--- src/include/codegen/proxy/csv_scanner_proxy.h | 13 +- src/include/codegen/util/csv_scanner.h | 169 ++++++++++++- src/include/planner/csv_scan_plan.h | 19 +- 6 files changed, 392 insertions(+), 52 deletions(-) diff --git a/src/codegen/operator/csv_scan_translator.cpp b/src/codegen/operator/csv_scan_translator.cpp index e38525ada35..480ad45e479 100644 --- a/src/codegen/operator/csv_scan_translator.cpp +++ b/src/codegen/operator/csv_scan_translator.cpp @@ -39,6 +39,7 @@ void CSVScanTranslator::InitializeState() { // Arguments auto *scanner_ptr = LoadStatePtr(scanner_id_); + auto *exec_ctx_ptr = GetCompilationContext().GetExecutorContextPtr(); auto *file_path = codegen.ConstString(scan_.GetFileName(), "filePath"); auto *output_col_types = ConstructColumnDescriptor(); auto *runtime_state_ptr = codegen->CreatePointerCast( @@ -50,12 +51,15 @@ void CSVScanTranslator::InitializeState() { codegen.Const32(static_cast(out_cols.size())); auto *consumer_func = codegen->CreatePointerCast( - consumer_func_, proxy::TypeBuilder::GetType(codegen)); + consumer_func_, proxy::TypeBuilder::GetType(codegen)); // Call codegen.Call(CSVScannerProxy::Init, - {scanner_ptr, file_path, output_col_types, num_output_cols, - consumer_func, runtime_state_ptr}); + {scanner_ptr, exec_ctx_ptr, file_path, output_col_types, + num_output_cols, consumer_func, runtime_state_ptr, + codegen.Const8(scan_.GetDelimiterChar()), + codegen.Const8(scan_.GetQuoteChar()), + codegen.Const8(scan_.GetEscapeChar())}); } void CSVScanTranslator::DefineAuxiliaryFunctions() { diff --git a/src/codegen/proxy/csv_scanner_proxy.cpp b/src/codegen/proxy/csv_scanner_proxy.cpp index 89b4b7dca16..f57a11fe014 100644 --- a/src/codegen/proxy/csv_scanner_proxy.cpp +++ b/src/codegen/proxy/csv_scanner_proxy.cpp @@ -12,16 +12,17 @@ #include "codegen/proxy/csv_scanner_proxy.h" +#include "codegen/proxy/executor_context_proxy.h" #include "codegen/proxy/runtime_functions_proxy.h" namespace peloton { namespace codegen { -DEFINE_TYPE(CSVScanner, "util::CSVScanner", MEMBER(file_path), MEMBER(callback), - MEMBER(opaque_callback_state), MEMBER(cols), MEMBER(cols_view)); +DEFINE_TYPE(CSVScanner, "util::CSVScanner", MEMBER(opaque1), MEMBER(cols), + MEMBER(opaque2)); DEFINE_TYPE(CSVScannerColumn, "util::CSVScanner::Column", MEMBER(type), - MEMBER(ptr), MEMBER(len), MEMBER(is_null)); + MEMBER(ptr), MEMBER(len), MEMBER(is_null)); DEFINE_METHOD(peloton::codegen::util, CSVScanner, Init); DEFINE_METHOD(peloton::codegen::util, CSVScanner, Destroy); diff --git a/src/codegen/util/csv_scanner.cpp b/src/codegen/util/csv_scanner.cpp index 544269c82bd..b0038563339 100644 --- a/src/codegen/util/csv_scanner.cpp +++ b/src/codegen/util/csv_scanner.cpp @@ -15,59 +15,231 @@ #include #include "common/exception.h" +#include "executor/executor_context.h" +#include "type/abstract_pool.h" #include "util/string_util.h" namespace peloton { namespace codegen { namespace util { -CSVScanner::CSVScanner(const std::string &file_path, +CSVScanner::CSVScanner(peloton::type::AbstractPool &pool, + const std::string &file_path, const codegen::type::Type *col_types, uint32_t num_cols, - CSVScanner::Callback func, void *opaque_state) - : file_path_(file_path), func_(func), opaque_state_(opaque_state) { + CSVScanner::Callback func, void *opaque_state, + char delimiter, char quote, char escape) + : memory_(pool), + file_path_(file_path), + file_(), + buffer_(nullptr), + buffer_begin_(0), + buffer_end_(0), + line_(nullptr), + line_len_(0), + line_maxlen_(0), + delimiter_(delimiter), + quote_(quote), + escape_(escape), + func_(func), + opaque_state_(opaque_state) { + // Make column array + cols_ = static_cast( + memory_.Allocate(sizeof(CSVScanner::Column) * num_cols)); + // Initialize the columns - cols_.resize(num_cols); for (uint32_t i = 0; i < num_cols; i++) { cols_[i].col_type = col_types[i]; cols_[i].ptr = nullptr; + cols_[i].len = 0; cols_[i].is_null = false; } - - // Setup the view. Since the Column's vector will never be resized after this - // point (it isn't possible to add or remove columns once the scan has been - // constructed), grabbing a pointer to the underlying array is safe for the - // lifetime of this scanner. - cols_view_ = cols_.data(); } -CSVScanner::~CSVScanner() {} +CSVScanner::~CSVScanner() { + if (buffer_ != nullptr) { + memory_.Free(buffer_); + } + if (line_ != nullptr) { + memory_.Free(line_); + } + if (cols_ != nullptr) { + memory_.Free(cols_); + } +} -void CSVScanner::Init(CSVScanner &scanner, const char *file_path, +void CSVScanner::Init(CSVScanner &scanner, + executor::ExecutorContext &executor_context, + const char *file_path, const codegen::type::Type *col_types, uint32_t num_cols, - CSVScanner::Callback func, void *opaque_state) { - new (&scanner) CSVScanner(file_path, col_types, num_cols, func, opaque_state); + CSVScanner::Callback func, void *opaque_state, + char delimiter, char quote, char escape) { + // Forward to constructor + new (&scanner) + CSVScanner(*executor_context.GetPool(), file_path, col_types, num_cols, + func, opaque_state, delimiter, quote, escape); } -void CSVScanner::Destroy(CSVScanner &scanner) { scanner.~CSVScanner(); } +void CSVScanner::Destroy(CSVScanner &scanner) { + // Forward to destructor + scanner.~CSVScanner(); +} -void CSVScanner::Produce() { InitializeScan(); } +void CSVScanner::Produce() { + // Initialize + Initialize(); -void CSVScanner::InitializeScan() { - // Validity checks - if (!boost::filesystem::exists(file_path_)) { - throw ExecutorException{StringUtil::Format( - "ERROR: input path '%s' does not exist", file_path_.c_str())}; + // Loop lines + while (const char *line = NextLine()) { + ProduceCSV(line); } +} + +void CSVScanner::Initialize() { + // Let's first perform a few validity checks + boost::filesystem::path path{file_path_}; - if (!boost::filesystem::is_directory(file_path_)) { - throw ExecutorException{StringUtil::Format( - "ERROR: input '%s' is a directory, not a file", file_path_.c_str())}; + if (!boost::filesystem::exists(path)) { + throw ExecutorException{StringUtil::Format("input path '%s' does not exist", + file_path_.c_str())}; + } else if (!boost::filesystem::is_regular_file(file_path_)) { + throw ExecutorException{ + StringUtil::Format("unable to read file '%s'", file_path_.c_str())}; } - if (!boost::filesystem::is_regular_file(file_path_)) { - throw ExecutorException{StringUtil::Format( - "ERROR: unable to read file '%s'", file_path_.c_str())}; + // The path looks okay, let's try opening it + file_.Open(file_path_, peloton::util::File::AccessMode::ReadOnly); + + // Allocate buffer space + buffer_ = static_cast(memory_.Allocate(kDefaultBufferSize + 1)); + + // Fill read-buffer + NextBuffer(); + + // Allocate space for the full line, if it doesn't fit into the buffer + line_ = static_cast(memory_.Allocate(kDefaultBufferSize)); + line_len_ = 0; + line_maxlen_ = kDefaultBufferSize; +} + +bool CSVScanner::NextBuffer() { + // Do read + buffer_begin_ = 0; + buffer_end_ = static_cast(file_.Read(buffer_, kDefaultBufferSize)); + + // Update stats + stats_.num_reads++; + + return (buffer_end_ != 0); +} + +void CSVScanner::AppendToCurrentLine(const char *data, uint32_t len) { + // Short-circuit if we're not appending any data + if (len == 0) { + return; + } + + if (line_len_ + len > line_maxlen_) { + // The current line buffer isn't large enough to store the new bytes, so we + // need to resize it. By default, we double the capacity. + auto new_maxlen = line_maxlen_ * 2; + while (new_maxlen < len) { + new_maxlen *= 2; + } + auto *new_line = static_cast(memory_.Allocate(new_maxlen)); + + // Copy the old data + PELOTON_MEMCPY(new_line, line_, line_len_); + + // Setup pointers and sizes + line_ = new_line; + line_maxlen_ = new_maxlen; + + stats_.num_reallocs++; } + + // At this point, we've guaranteed that the line is large enough to + // accommodate the new bytes, so let's go ahead and perform the copy. + + PELOTON_MEMCPY(line_ + line_len_, data, len); + + // Increase the length of the line + line_len_ += len; + + // Track copy stats + stats_.num_copies++; +} + +// The main purpose of this function is to find the start of the next line in +// the CSV file. +const char *CSVScanner::NextLine() { + line_len_ = 0; + + bool in_quote = false; + bool last_was_escape = false; + bool copied_to_line_buf = false; + + uint32_t line_end = buffer_begin_; + + while (true) { + if (line_end >= buffer_end_) { + // We need to read more data from the CSV file. But first, we need to copy + // all the data in the read-buffer (i.e., [buffer_begin_, buffer_end_] to + // the line-buffer. + + AppendToCurrentLine(buffer_ + buffer_begin_, + static_cast(buffer_end_ - buffer_begin_)); + + // Now, read more data + if (!NextBuffer()) { + return nullptr; + } + + // Reset positions + line_end = buffer_begin_; + copied_to_line_buf = true; + } + + // Read character + char c = buffer_[line_end]; + + if (in_quote && c == escape_) { + last_was_escape = true; + } + if (c == quote_ && !last_was_escape) { + in_quote = true; + } + if (c != escape_) { + last_was_escape = false; + } + + // Process the new-line character. If we a new-line and we're not currently + // in a quoted section, we're done. + if (c == '\n' && !in_quote) { + buffer_[line_end] = '\0'; + break; + } + + // Move along + line_end++; + } + + // Increment line number + line_number_++; + + if (copied_to_line_buf) { + AppendToCurrentLine(buffer_, line_end); + buffer_begin_ = line_end + 1; + return line_; + } else { + const char *ret = buffer_ + buffer_begin_; + buffer_begin_ = line_end + 1; + return ret; + } +} + +void CSVScanner::ProduceCSV(UNUSED_ATTRIBUTE const char *line) { + // TODO: me + func_(opaque_state_); } } // namespace util diff --git a/src/include/codegen/proxy/csv_scanner_proxy.h b/src/include/codegen/proxy/csv_scanner_proxy.h index c31d871ff74..fabcfe9d953 100644 --- a/src/include/codegen/proxy/csv_scanner_proxy.h +++ b/src/include/codegen/proxy/csv_scanner_proxy.h @@ -12,9 +12,11 @@ #pragma once +#include "codegen/proxy/pool_proxy.h" #include "codegen/proxy/proxy.h" #include "codegen/proxy/type_builder.h" #include "codegen/util/csv_scanner.h" +#include "util/file.h" namespace peloton { namespace codegen { @@ -28,11 +30,12 @@ PROXY(CSVScannerColumn) { }; PROXY(CSVScanner) { - DECLARE_MEMBER(0, char[sizeof(std::string)], file_path); - DECLARE_MEMBER(1, char[sizeof(util::CSVScanner::Callback)], callback); - DECLARE_MEMBER(2, void *, opaque_callback_state); - DECLARE_MEMBER(3, char[sizeof(std::vector)], cols); - DECLARE_MEMBER(4, util::CSVScanner::Column *, cols_view); + DECLARE_MEMBER(0, char[sizeof(codegen::util::CSVScanner) - + sizeof(util::CSVScanner::Column *) - + sizeof(util::CSVScanner::Stats) - 4], + opaque1); + DECLARE_MEMBER(1, util::CSVScanner::Column *, cols); + DECLARE_MEMBER(2, char[sizeof(util::CSVScanner::Stats) + 4], opaque2); DECLARE_TYPE; DECLARE_METHOD(Init); diff --git a/src/include/codegen/util/csv_scanner.h b/src/include/codegen/util/csv_scanner.h index 711a3d13535..56d64f1f371 100644 --- a/src/include/codegen/util/csv_scanner.h +++ b/src/include/codegen/util/csv_scanner.h @@ -16,48 +16,195 @@ #include #include "codegen/type/type.h" +#include "util/file.h" namespace peloton { + +namespace executor { +class ExecutorContext; +} // namespace executor + +namespace type { +class AbstractPool; +} // namespace type + namespace codegen { namespace util { +/** + * This is the primary class to scan CSV files. Callers use the constructor to + * configure various aspects of how parsing is performed. Callers must provide + * a description of the rows stored in the CSV file, and a callback function + * that is invoked once for every row in the CSV file. The delimiter character, + * quoting character, and escape characters can also be configured through the + * constructor. + * + * This scanner class is fail-fast. If it finds an ill-formatted row, it will + * immediately throw an error. + * + * TODO: implement a more generous parser that is best-effort. + */ class CSVScanner { public: + // 64K buffer size + static constexpr uint32_t kDefaultBufferSize = (1ul << 16); + + // The signature of the callback function using Callback = void (*)(void *); + /** + * Column information + */ struct Column { + // The type of data this column represents codegen::type::Type col_type; + + // A pointer to where the next value of this column is char *ptr; + + // The number of bytes uint32_t len; + + // Is the next value of this column NULL bool is_null; }; - CSVScanner(const std::string &file_path, const codegen::type::Type *col_types, - uint32_t num_cols, Callback func, void *opaque_state); + /** + * Various statistics tracked while we scan the CSV + */ + struct Stats { + // The number of times the read-buffer was copied into the line-buffer + uint32_t num_copies = 0; + // The number of times we had to re-allocate the line-buffer to make room + // for new data (i.e., to handle really long lines that don't fit into the + // read-buffer) + uint32_t num_reallocs = 0; + // The number of times we had to call Read() from the file + uint32_t num_reads = 0; + }; + /** + * Constructor. + * + * @param memory A memory pool where all allocations are sourced from + * @param file_path The full path to the CSV file + * @param col_types A description of the rows stored in the CSV + * @param num_cols The number of columns to expect + * @param func The callback function to invoke per row/line in the CSV + * @param opaque_state An opaque state that is passed to the callback function + * upon invocation. + * @param delimiter The character that separates columns within a row + * @param quote The quoting character used to quote data (i.e., strings) + * @param escape The character that appears before any data characters that + * are the same as the quote character. + */ + CSVScanner(peloton::type::AbstractPool &memory, const std::string &file_path, + const codegen::type::Type *col_types, uint32_t num_cols, + Callback func, void *opaque_state, char delimiter = ',', + char quote = '"', char escape = '"'); + + /** + * Destructor + */ ~CSVScanner(); - static void Init(CSVScanner &scanner, const char *file_path, - const codegen::type::Type *col_types, uint32_t num_cols, - Callback func, void *opaque_state); - + /** + * Initialization function. This is the entry point from codegen to initialize + * scanner instances. + * + * @param scanner The scanner we're initializing + * @param memory A memory pool where all allocations are sourced from + * @param file_path The full path to the CSV file + * @param col_types A description of the rows stored in the CSV + * @param num_cols The number of columns to expect + * @param func The callback function to invoke per row/line in the CSV + * @param opaque_state An opaque state that is passed to the callback function + * upon invocation. + * @param delimiter The character that separates columns within a row + * @param quote The quoting character used to quote data (i.e., strings) + * @param escape The character that appears before any data characters that + * are the same as the quote character. + */ + static void Init(CSVScanner &scanner, + executor::ExecutorContext &executor_context, + const char *file_path, const codegen::type::Type *col_types, + uint32_t num_cols, Callback func, void *opaque_state, + char delimiter, char quote, char escape); + + /** + * Destruction function. This is the entry point from codegen when cleaning up + * and reclaiming memory from scanner instances. + * + * @param scanner The scanner we're destroying. + */ static void Destroy(CSVScanner &scanner); + /** + * Produce all the rows stored in the configured CSV file + */ void Produce(); + /** + * Return the list of columns + * + * @return + */ + const Column *GetColumns() const { return cols_; } + private: - void InitializeScan(); + // Initialize the scan + void Initialize(); + + // Append bytes to the end of the currently accruing line. + void AppendToCurrentLine(const char *data, uint32_t len); + + // Read the next line from the CSV file + const char *NextLine(); + + // Read a buffer's worth of data from the CSV file + bool NextBuffer(); + + // Produce CSV data stored in the provided line + void ProduceCSV(const char *line); private: - // The file + // All memory allocations happen from this pool + peloton::type::AbstractPool &memory_; + + // The path to the CSV file const std::string file_path_; - // The callback function and opaque state + // The CSV file handle + peloton::util::File file_; + + // The temporary buffer where raw file contents are read into + // TODO: make these unique_ptr's with a customer deleter + char *buffer_; + uint32_t buffer_begin_; + uint32_t buffer_end_; + + // A pointer to the start of a line in the CSV file + char *line_; + uint32_t line_len_; + uint32_t line_maxlen_; + + // Line number + uint32_t line_number_; + + // The column delimiter, quote, and escape characters configured for this CSV + char delimiter_; + char quote_; + char escape_; + + // The callback function to call for each row of the CSV, and an opaque state Callback func_; void *opaque_state_; - std::vector cols_; - Column *cols_view_; + // The columns + Column *cols_; + + // Statistics + Stats stats_; }; } // namespace util diff --git a/src/include/planner/csv_scan_plan.h b/src/include/planner/csv_scan_plan.h index 1c14a1d9ece..f9611b22630 100644 --- a/src/include/planner/csv_scan_plan.h +++ b/src/include/planner/csv_scan_plan.h @@ -35,7 +35,8 @@ class CSVScanPlan : public AbstractScan { * @param file_name The file path * @param cols Information of the columns expected in each row of the CSV */ - CSVScanPlan(std::string file_name, std::vector &&cols); + CSVScanPlan(std::string file_name, std::vector &&cols, + char delimiter = ',', char quote = '"', char escape = '"'); ////////////////////////////////////////////////////////////////////////////// /// @@ -51,6 +52,10 @@ class CSVScanPlan : public AbstractScan { void GetAttributes(std::vector &ais) const override; + char GetDelimiterChar() const { return delimiter_; } + char GetQuoteChar() const { return quote_; } + char GetEscapeChar() const { return escape_; } + ////////////////////////////////////////////////////////////////////////////// /// /// Utilities + Internal @@ -68,6 +73,10 @@ class CSVScanPlan : public AbstractScan { private: const std::string file_name_; + char delimiter_; + char quote_; + char escape_; + std::vector> attributes_; }; @@ -78,8 +87,12 @@ class CSVScanPlan : public AbstractScan { //////////////////////////////////////////////////////////////////////////////// inline CSVScanPlan::CSVScanPlan(std::string file_name, - std::vector &&cols) - : file_name_(std::move(file_name)) { + std::vector &&cols, + char delimiter, char quote, char escape) + : file_name_(std::move(file_name)), + delimiter_(delimiter), + quote_(quote), + escape_(escape) { for (const auto &col : cols) { std::unique_ptr attribute{ new planner::AttributeInfo()}; From 93f39fc676086ae758b028ee450834a855555aaa Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Tue, 8 May 2018 01:51:05 -0400 Subject: [PATCH 17/42] Process CSV line in scanner --- src/codegen/operator/csv_scan_translator.cpp | 2 +- src/codegen/util/csv_scanner.cpp | 25 ++++++++++++++----- src/include/codegen/proxy/csv_scanner_proxy.h | 7 +++--- src/include/codegen/util/csv_scanner.h | 3 ++- 4 files changed, 25 insertions(+), 12 deletions(-) diff --git a/src/codegen/operator/csv_scan_translator.cpp b/src/codegen/operator/csv_scan_translator.cpp index 480ad45e479..3d87dd9dbe7 100644 --- a/src/codegen/operator/csv_scan_translator.cpp +++ b/src/codegen/operator/csv_scan_translator.cpp @@ -88,7 +88,7 @@ void CSVScanTranslator::DefineAuxiliaryFunctions() { // Load the pointer to the columns view auto *cols = codegen->CreateLoad(codegen->CreateConstInBoundsGEP2_32( - CSVScannerProxy::GetType(codegen), LoadStatePtr(scanner_id_), 0, 4)); + CSVScannerProxy::GetType(codegen), LoadStatePtr(scanner_id_), 0, 1)); // For each column, call the type's input function to read the input value for (uint32_t i = 0; i < output_attributes.size(); i++) { diff --git a/src/codegen/util/csv_scanner.cpp b/src/codegen/util/csv_scanner.cpp index b0038563339..6c5f23ee61d 100644 --- a/src/codegen/util/csv_scanner.cpp +++ b/src/codegen/util/csv_scanner.cpp @@ -41,13 +41,13 @@ CSVScanner::CSVScanner(peloton::type::AbstractPool &pool, quote_(quote), escape_(escape), func_(func), - opaque_state_(opaque_state) { + opaque_state_(opaque_state), + num_cols_(num_cols) { // Make column array - cols_ = static_cast( - memory_.Allocate(sizeof(CSVScanner::Column) * num_cols)); + cols_ = static_cast(memory_.Allocate(sizeof(Column) * num_cols_)); // Initialize the columns - for (uint32_t i = 0; i < num_cols; i++) { + for (uint32_t i = 0; i < num_cols_; i++) { cols_[i].col_type = col_types[i]; cols_[i].ptr = nullptr; cols_[i].len = 0; @@ -237,8 +237,21 @@ const char *CSVScanner::NextLine() { } } -void CSVScanner::ProduceCSV(UNUSED_ATTRIBUTE const char *line) { - // TODO: me +void CSVScanner::ProduceCSV(const char *line) { + // At this point, we have a well-formed line. Let's pull out pointers to the + // columns. + + const auto *iter = line; + for (uint32_t col_idx = 0; col_idx < num_cols_; col_idx++) { + const char *start = iter; + for (; *iter != 0 && *iter != delimiter_; iter++) {} + cols_[col_idx].ptr = start; + cols_[col_idx].len = static_cast(iter - start); + cols_[col_idx].is_null = (cols_[col_idx].len == 0); + iter++; + } + + // Invoke callback func_(opaque_state_); } diff --git a/src/include/codegen/proxy/csv_scanner_proxy.h b/src/include/codegen/proxy/csv_scanner_proxy.h index fabcfe9d953..ae9b13cd7ec 100644 --- a/src/include/codegen/proxy/csv_scanner_proxy.h +++ b/src/include/codegen/proxy/csv_scanner_proxy.h @@ -12,11 +12,9 @@ #pragma once -#include "codegen/proxy/pool_proxy.h" #include "codegen/proxy/proxy.h" #include "codegen/proxy/type_builder.h" #include "codegen/util/csv_scanner.h" -#include "util/file.h" namespace peloton { namespace codegen { @@ -32,10 +30,11 @@ PROXY(CSVScannerColumn) { PROXY(CSVScanner) { DECLARE_MEMBER(0, char[sizeof(codegen::util::CSVScanner) - sizeof(util::CSVScanner::Column *) - - sizeof(util::CSVScanner::Stats) - 4], + sizeof(util::CSVScanner::Stats) - sizeof(uint32_t)], opaque1); DECLARE_MEMBER(1, util::CSVScanner::Column *, cols); - DECLARE_MEMBER(2, char[sizeof(util::CSVScanner::Stats) + 4], opaque2); + DECLARE_MEMBER(2, char[sizeof(util::CSVScanner::Stats) + sizeof(uint32_t)], + opaque2); DECLARE_TYPE; DECLARE_METHOD(Init); diff --git a/src/include/codegen/util/csv_scanner.h b/src/include/codegen/util/csv_scanner.h index 56d64f1f371..a69c673054b 100644 --- a/src/include/codegen/util/csv_scanner.h +++ b/src/include/codegen/util/csv_scanner.h @@ -60,7 +60,7 @@ class CSVScanner { codegen::type::Type col_type; // A pointer to where the next value of this column is - char *ptr; + const char *ptr; // The number of bytes uint32_t len; @@ -202,6 +202,7 @@ class CSVScanner { // The columns Column *cols_; + uint32_t num_cols_; // Statistics Stats stats_; From 2aa0aa16a35250e4fd47517181b1356f2b9539f7 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Wed, 9 May 2018 02:02:52 -0400 Subject: [PATCH 18/42] Free memory when re-allocating line buffer --- src/codegen/util/csv_scanner.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/codegen/util/csv_scanner.cpp b/src/codegen/util/csv_scanner.cpp index 6c5f23ee61d..a7140ad322d 100644 --- a/src/codegen/util/csv_scanner.cpp +++ b/src/codegen/util/csv_scanner.cpp @@ -150,6 +150,9 @@ void CSVScanner::AppendToCurrentLine(const char *data, uint32_t len) { // Copy the old data PELOTON_MEMCPY(new_line, line_, line_len_); + // Free old old + memory_.Free(line_); + // Setup pointers and sizes line_ = new_line; line_maxlen_ = new_maxlen; From 78c080541d605f2364d667a85c183ac5a7fda1ea Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Wed, 9 May 2018 02:04:30 -0400 Subject: [PATCH 19/42] Added memcmp to codegen interface. Renamed CallPrintf() to Printf(). --- src/codegen/codegen.cpp | 19 ++++++++++++++++--- src/include/codegen/codegen.h | 6 ++++-- src/include/codegen/util/csv_scanner.h | 8 ++++---- test/codegen/testing_codegen_util.cpp | 2 +- test/codegen/value_integrity_test.cpp | 4 ++-- 5 files changed, 27 insertions(+), 12 deletions(-) diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index 6a96a0f7542..e0082f7d588 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -159,12 +159,12 @@ llvm::Value *CodeGen::CallFunc(llvm::Value *fn, return GetBuilder().CreateCall(fn, args); } -llvm::Value *CodeGen::CallPrintf(const std::string &format, - const std::vector &args) { +llvm::Value *CodeGen::Printf(const std::string &format, + const std::vector &args) { auto *printf_fn = LookupBuiltin("printf"); if (printf_fn == nullptr) { printf_fn = RegisterBuiltin( - "printf", llvm::TypeBuilder::get(GetContext()), + "printf", llvm::TypeBuilder::get(GetContext()), reinterpret_cast(printf)); } @@ -176,6 +176,19 @@ llvm::Value *CodeGen::CallPrintf(const std::string &format, return CallFunc(printf_fn, printf_args); } +llvm::Value *CodeGen::Memcmp(llvm::Value *ptr1, llvm::Value *ptr2, + llvm::Value *len) { + static constexpr char kMemcmpFnName[] = "memcmp"; + auto *memcmp_fn = LookupBuiltin(kMemcmpFnName); + if (memcmp_fn == nullptr) { + memcmp_fn = RegisterBuiltin( + kMemcmpFnName, + llvm::TypeBuilder::get(GetContext()), + reinterpret_cast(printf)); + } + return CallFunc(memcmp_fn, {ptr1, ptr2, len}); +} + llvm::Value *CodeGen::Sqrt(llvm::Value *val) { llvm::Function *sqrt_func = llvm::Intrinsic::getDeclaration( &GetModule(), llvm::Intrinsic::sqrt, val->getType()); diff --git a/src/include/codegen/codegen.h b/src/include/codegen/codegen.h index 3dceb820715..037e01dbe11 100644 --- a/src/include/codegen/codegen.h +++ b/src/include/codegen/codegen.h @@ -131,8 +131,10 @@ class CodeGen { //===--------------------------------------------------------------------===// // C/C++ standard library functions //===--------------------------------------------------------------------===// - llvm::Value *CallPrintf(const std::string &format, - const std::vector &args); + llvm::Value *Printf(const std::string &format, + const std::vector &args); + llvm::Value *Memcmp(llvm::Value *ptr1, llvm::Value *ptr2, + llvm::Value *len); llvm::Value *Sqrt(llvm::Value *val); //===--------------------------------------------------------------------===// diff --git a/src/include/codegen/util/csv_scanner.h b/src/include/codegen/util/csv_scanner.h index a69c673054b..d51475c4c43 100644 --- a/src/include/codegen/util/csv_scanner.h +++ b/src/include/codegen/util/csv_scanner.h @@ -95,8 +95,8 @@ class CSVScanner { * upon invocation. * @param delimiter The character that separates columns within a row * @param quote The quoting character used to quote data (i.e., strings) - * @param escape The character that appears before any data characters that - * are the same as the quote character. + * @param escape The character that should appear before any data characters + * that match the quote character. */ CSVScanner(peloton::type::AbstractPool &memory, const std::string &file_path, const codegen::type::Type *col_types, uint32_t num_cols, @@ -122,8 +122,8 @@ class CSVScanner { * upon invocation. * @param delimiter The character that separates columns within a row * @param quote The quoting character used to quote data (i.e., strings) - * @param escape The character that appears before any data characters that - * are the same as the quote character. + * @param escape The character that should appear before any data characters + * that match the quote character. */ static void Init(CSVScanner &scanner, executor::ExecutorContext &executor_context, diff --git a/test/codegen/testing_codegen_util.cpp b/test/codegen/testing_codegen_util.cpp index 5302eae3daf..316b46331d6 100644 --- a/test/codegen/testing_codegen_util.cpp +++ b/test/codegen/testing_codegen_util.cpp @@ -446,7 +446,7 @@ void Printer::ConsumeResult(codegen::ConsumerContext &ctx, format.append("]\n"); // Make the printf call - codegen.CallPrintf(format, cols); + codegen.Printf(format, cols); } } // namespace test diff --git a/test/codegen/value_integrity_test.cpp b/test/codegen/value_integrity_test.cpp index 4c4ccf97690..9c78ece4787 100644 --- a/test/codegen/value_integrity_test.cpp +++ b/test/codegen/value_integrity_test.cpp @@ -59,7 +59,7 @@ void DivideByZeroTest(const codegen::type::Type &data_type, ExpressionType op) { } } - codegen.CallPrintf("%lu\n", {res.GetValue()}); + codegen.Printf("%lu\n", {res.GetValue()}); function.ReturnAndFinish(); } @@ -126,7 +126,7 @@ void OverflowTest(const codegen::type::Type &data_type, ExpressionType op) { } } - codegen.CallPrintf("%lu\n", {res.GetValue()}); + codegen.Printf("%lu\n", {res.GetValue()}); function.ReturnAndFinish(); } From f46634a482a98a5842edffb03929016499f3985c Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Wed, 9 May 2018 02:05:11 -0400 Subject: [PATCH 20/42] Cleaned up CSV scan translator. Added null checking. --- src/codegen/operator/csv_scan_translator.cpp | 241 ++++++++++++------ src/codegen/values_runtime.cpp | 31 ++- .../codegen/operator/csv_scan_translator.h | 11 +- src/include/codegen/proxy/csv_scanner_proxy.h | 3 +- src/include/planner/csv_scan_plan.h | 37 ++- 5 files changed, 205 insertions(+), 118 deletions(-) diff --git a/src/codegen/operator/csv_scan_translator.cpp b/src/codegen/operator/csv_scan_translator.cpp index 3d87dd9dbe7..d1c191a9ccd 100644 --- a/src/codegen/operator/csv_scan_translator.cpp +++ b/src/codegen/operator/csv_scan_translator.cpp @@ -29,111 +29,199 @@ CSVScanTranslator::CSVScanTranslator(const planner::CSVScanPlan &scan, CompilationContext &context, Pipeline &pipeline) : OperatorTranslator(context, pipeline), scan_(scan) { + // Register the CSV scanner instance auto &runtime_state = context.GetRuntimeState(); scanner_id_ = runtime_state.RegisterState( "csvScanner", CSVScannerProxy::GetType(GetCodeGen())); + + // Load information about the attributes output by the scan plan + scan_.GetAttributes(output_attributes_); } void CSVScanTranslator::InitializeState() { auto &codegen = GetCodeGen(); // Arguments - auto *scanner_ptr = LoadStatePtr(scanner_id_); - auto *exec_ctx_ptr = GetCompilationContext().GetExecutorContextPtr(); - auto *file_path = codegen.ConstString(scan_.GetFileName(), "filePath"); - auto *output_col_types = ConstructColumnDescriptor(); - auto *runtime_state_ptr = codegen->CreatePointerCast( - codegen.GetState(), codegen.VoidType()->getPointerTo()); + llvm::Value *scanner_ptr = LoadStatePtr(scanner_id_); + llvm::Value *exec_ctx_ptr = GetCompilationContext().GetExecutorContextPtr(); + llvm::Value *file_path = codegen.ConstString(scan_.GetFileName(), "filePath"); - std::vector out_cols; - scan_.GetOutputColumns(out_cols); - auto *num_output_cols = - codegen.Const32(static_cast(out_cols.size())); + auto num_cols = static_cast(output_attributes_.size()); - auto *consumer_func = codegen->CreatePointerCast( - consumer_func_, proxy::TypeBuilder::GetType(codegen)); + // We need to generate an array of type::Type. To do so, we construct a vector + // of the types of the output columns, and we create an LLVM constant that is + // a copy of the underlying bytes. + + std::vector col_types_vec; + col_types_vec.reserve(num_cols); + for (const auto *ai : output_attributes_) { + col_types_vec.push_back(ai->type); + } + llvm::Value *raw_col_type_bytes = codegen.ConstGenericBytes( + col_types_vec.data(), static_cast(col_types_vec.capacity()), + "colTypes"); + llvm::Value *output_col_types = codegen->CreatePointerCast( + raw_col_type_bytes, TypeProxy::GetType(codegen)->getPointerTo()); + + // Now create a pointer to the consumer function + using ConsumerFuncType = void (*)(void *); + llvm::Value *consumer_func = codegen->CreatePointerCast( + consumer_func_, proxy::TypeBuilder::GetType(codegen)); + + // Cast the runtime type to an opaque void*. This is because we're calling + // into pre-compiled C++ that doesn't know that the dynamically generated + // RuntimeState* looks like. + llvm::Value *runtime_state_ptr = codegen->CreatePointerCast( + codegen.GetState(), codegen.VoidType()->getPointerTo()); - // Call + // Call CSVScanner::Init() codegen.Call(CSVScannerProxy::Init, {scanner_ptr, exec_ctx_ptr, file_path, output_col_types, - num_output_cols, consumer_func, runtime_state_ptr, + codegen.Const32(num_cols), consumer_func, runtime_state_ptr, codegen.Const8(scan_.GetDelimiterChar()), codegen.Const8(scan_.GetQuoteChar()), codegen.Const8(scan_.GetEscapeChar())}); } +namespace { + +class CSVColumnAccess : public RowBatch::AttributeAccess { + public: + CSVColumnAccess(const planner::AttributeInfo *ai, llvm::Value *csv_columns, + const std::string &null_str, llvm::Value *runtime_null_str) + : ai_(ai), + csv_columns_(csv_columns), + null_str_(null_str), + runtime_null_(runtime_null_str) {} + + llvm::Value *Columns() const { return csv_columns_; } + + uint32_t ColumnIndex() const { return ai_->attribute_id; } + + bool IsNullable() const { return ai_->type.nullable; } + + const type::SqlType &SqlType() const { return ai_->type.GetSqlType(); } + + llvm::Value *IsNull(CodeGen &codegen, llvm::Value *data_ptr, + llvm::Value *data_len) const { + uint32_t null_str_len = static_cast(null_str_.length()); + + // Is the length of the column value the same as the NULL string? + llvm::Value *eq_len = + codegen->CreateICmpEQ(data_len, codegen.Const32(null_str_len)); + + // If the null string is empty, generate simple comparison + if (null_str_len == 0) { + return eq_len; + } + + llvm::Value *cmp_res; + lang::If check_null{codegen, eq_len}; + { + // Do a memcmp against the NULL string + cmp_res = codegen.Memcmp(data_ptr, runtime_null_, + codegen.Const64(null_str_.length())); + cmp_res = codegen->CreateICmpEQ(cmp_res, codegen.Const32(0)); + } + check_null.EndIf(); + return check_null.BuildPHI(cmp_res, codegen.ConstBool(false)); + } + + Value LoadValueIgnoreNull(CodeGen &codegen, llvm::Value *type, + llvm::Value *data_ptr, + llvm::Value *data_len) const { + auto *input_func = SqlType().GetInputFunction(codegen, ai_->type); + auto *raw_val = codegen.CallFunc(input_func, {type, data_ptr, data_len}); + return codegen::Value{ai_->type, raw_val, nullptr, + codegen.ConstBool(false)}; + } + + Value Access(CodeGen &codegen, UNUSED_ATTRIBUTE RowBatch::Row &row) override { + // Load the type, data pointer and length values for the column + auto *type = codegen->CreateConstInBoundsGEP2_32( + CSVScannerColumnProxy::GetType(codegen), Columns(), ColumnIndex(), 0); + auto *data_ptr = codegen->CreateLoad(codegen->CreateConstInBoundsGEP2_32( + CSVScannerColumnProxy::GetType(codegen), Columns(), ColumnIndex(), 1)); + auto *data_len = codegen->CreateLoad(codegen->CreateConstInBoundsGEP2_32( + CSVScannerColumnProxy::GetType(codegen), Columns(), ColumnIndex(), 2)); + + // If the valid isn't NULLable, avoid the null check here + if (!IsNullable()) { + return LoadValueIgnoreNull(codegen, type, data_ptr, data_len); + } + + // If the value isn't actually null, try to parse it + codegen::Value valid_val, null_val; + lang::If is_null{codegen, + codegen->CreateNot(IsNull(codegen, data_ptr, data_len))}; + { + // Load valid + valid_val = LoadValueIgnoreNull(codegen, type, data_ptr, data_len); + } + is_null.ElseBlock(); + { + // Default null + null_val = SqlType().GetNullValue(codegen); + } + is_null.EndIf(); + + // Return + return is_null.BuildPHI(valid_val, null_val); + } + + private: + const planner::AttributeInfo *ai_; + llvm::Value *csv_columns_; + const std::string &null_str_; + llvm::Value *runtime_null_; +}; + +} // namespace + void CSVScanTranslator::DefineAuxiliaryFunctions() { - // Define consumer function here CodeGen &codegen = GetCodeGen(); CompilationContext &cc = GetCompilationContext(); + // Define consumer function here std::vector arg_types = { {"runtimeState", cc.GetRuntimeState().FinalizeType(codegen)->getPointerTo()}}; - codegen::FunctionDeclaration decl{codegen.GetCodeContext(), "consumer", - FunctionDeclaration::Visibility::Internal, - codegen.VoidType(), arg_types}; - codegen::FunctionBuilder scan_consumer{codegen.GetCodeContext(), decl}; + FunctionDeclaration decl{codegen.GetCodeContext(), "consumer", + FunctionDeclaration::Visibility::Internal, + codegen.VoidType(), arg_types}; + FunctionBuilder scan_consumer{codegen.GetCodeContext(), decl}; { ConsumerContext ctx{cc, GetPipeline()}; Vector v{nullptr, 1, nullptr}; RowBatch one{GetCompilationContext(), codegen.Const32(0), codegen.Const32(1), v, false}; - RowBatch::Row row{one, nullptr, nullptr}; - - // Get the attributes - std::vector output_attributes; - scan_.GetAttributes(output_attributes); // Load the pointer to the columns view - auto *cols = codegen->CreateLoad(codegen->CreateConstInBoundsGEP2_32( + llvm::Value *cols = codegen->CreateLoad(codegen->CreateConstInBoundsGEP2_32( CSVScannerProxy::GetType(codegen), LoadStatePtr(scanner_id_), 0, 1)); - // For each column, call the type's input function to read the input value - for (uint32_t i = 0; i < output_attributes.size(); i++) { - const auto *output_ai = output_attributes[i]; - - const auto &sql_type = output_ai->type.GetSqlType(); - - auto *is_null = codegen->CreateConstInBoundsGEP2_32( - CSVScannerColumnProxy::GetType(codegen), cols, i, 3); - - codegen::Value val, null_val; - lang::If not_null{codegen, - codegen->CreateNot(codegen->CreateLoad(is_null))}; - { - // Grab a pointer to the ptr and length - auto *type = codegen->CreatePointerCast( - codegen.ConstType(output_ai->type), - TypeProxy::GetType(codegen)->getPointerTo()); - auto *ptr = codegen->CreateLoad(codegen->CreateConstInBoundsGEP2_32( - CSVScannerColumnProxy::GetType(codegen), cols, i, 1)); - auto *len = codegen->CreateLoad(codegen->CreateConstInBoundsGEP2_32( - CSVScannerColumnProxy::GetType(codegen), cols, i, 2)); - - // Invoke the input function - auto *input_func = sql_type.GetInputFunction(codegen, output_ai->type); - auto *raw_val = codegen.CallFunc(input_func, {type, ptr, len}); - - // Non-null value - val = codegen::Value{output_ai->type, raw_val, nullptr, - codegen.ConstBool(false)}; - } - not_null.ElseBlock(); - { - // Null value - null_val = sql_type.GetNullValue(codegen); - } - not_null.EndIf(); - - codegen::Value final_val = not_null.BuildPHI(val, null_val); - row.RegisterAttributeValue(output_ai, final_val); + llvm::Value *null_str = codegen.ConstString(scan_.GetNullString(), "null"); + + // Add accessors for all columns into the row batch + std::vector column_accessors; + for (uint32_t i = 0; i < output_attributes_.size(); i++) { + column_accessors.emplace_back(output_attributes_[i], cols, + scan_.GetNullString(), null_str); + } + for (uint32_t i = 0; i < output_attributes_.size(); i++) { + one.AddAttribute(output_attributes_[i], &column_accessors[i]); } + // Push the row through the pipeline + RowBatch::Row row{one, nullptr, nullptr}; ctx.Consume(row); + + // Done scan_consumer.ReturnAndFinish(); } + + // The consumer function has been generated. Get a pointer to it now. consumer_func_ = scan_consumer.GetFunction(); } @@ -148,27 +236,10 @@ void CSVScanTranslator::TearDownState() { } std::string CSVScanTranslator::GetName() const { - return std::__cxx11::string(); -} - -llvm::Value *CSVScanTranslator::ConstructColumnDescriptor() const { - // First, we pull out all the attributes produced by the scan, in order - std::vector cols; - scan_.GetAttributes(cols); - - // But, what we really need are just the column types, so pull those out now - std::vector col_types_vec; - for (const auto *col : cols) { - col_types_vec.push_back(col->type); - } - - CodeGen &codegen = GetCodeGen(); - - auto num_bytes = cols.size() * sizeof(decltype(col_types_vec)::value_type); - auto *bytes = codegen.ConstGenericBytes( - col_types_vec.data(), static_cast(num_bytes), "colTypes"); - return codegen->CreatePointerCast( - bytes, TypeProxy::GetType(codegen)->getPointerTo()); + return StringUtil::Format( + "CSVScan(file: '%s', delimiter: '%c', quote: '%c', escape: '%c')", + scan_.GetFileName().c_str(), scan_.GetDelimiterChar(), + scan_.GetQuoteChar(), scan_.GetEscapeChar()); } } // namespace codegen diff --git a/src/codegen/values_runtime.cpp b/src/codegen/values_runtime.cpp index dddc0a43ac6..9796b5457f6 100644 --- a/src/codegen/values_runtime.cpp +++ b/src/codegen/values_runtime.cpp @@ -139,6 +139,11 @@ void TrimLeftRight(const char *&left, const char *&right) { template typename std::enable_if::value, T>::type ToNum( const char *ptr, uint32_t len) { + if (len == 0) { + RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); + } + const char *start = ptr; const char *end = start + len; @@ -159,6 +164,7 @@ typename std::enable_if::value, T>::type ToNum( while (start != end) { if (*start < '0' || *start > '9') { RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); } num = (num * 10) + (*start - '0'); @@ -175,6 +181,7 @@ typename std::enable_if::value, T>::type ToNum( if (num <= std::numeric_limits::min() || num >= std::numeric_limits::max()) { RuntimeFunctions::ThrowOverflowException(); + __builtin_unreachable(); } // Done @@ -186,7 +193,11 @@ typename std::enable_if::value, T>::type ToNum( bool ValuesRuntime::InputBoolean(UNUSED_ATTRIBUTE const type::Type &type, const char *ptr, uint32_t len) { PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); - PELOTON_ASSERT(len != 0 && "Length must be non-zero"); + + if (len == 0) { + RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); + } const char *start = ptr, *end = ptr + len; @@ -201,7 +212,8 @@ bool ValuesRuntime::InputBoolean(UNUSED_ATTRIBUTE const type::Type &type, case 't': case 'T': { static constexpr char kTrue[] = "true"; - if (strncasecmp(start, kTrue, std::min(trimmed_len, sizeof(kTrue)))) { + std::cout << sizeof(kTrue) << std::endl; + if (strncasecmp(start, kTrue, trimmed_len) == 0) { return true; } break; @@ -209,7 +221,7 @@ bool ValuesRuntime::InputBoolean(UNUSED_ATTRIBUTE const type::Type &type, case 'f': case 'F': { static constexpr char kFalse[] = "false"; - if (strncasecmp(start, kFalse, std::min(trimmed_len, sizeof(kFalse)))) { + if (strncasecmp(start, kFalse, trimmed_len) == 0) { return false; } break; @@ -217,7 +229,7 @@ bool ValuesRuntime::InputBoolean(UNUSED_ATTRIBUTE const type::Type &type, case 'y': case 'Y': { static constexpr char kYes[] = "yes"; - if (strncasecmp(start, kYes, std::min(trimmed_len, sizeof(kYes)))) { + if (strncasecmp(start, kYes, trimmed_len) == 0) { return true; } break; @@ -225,7 +237,7 @@ bool ValuesRuntime::InputBoolean(UNUSED_ATTRIBUTE const type::Type &type, case 'n': case 'N': { static constexpr char kNo[] = "no"; - if (strncasecmp(start, kNo, std::min(trimmed_len, sizeof(kNo)))) { + if (strncasecmp(start, kNo, trimmed_len) == 0) { return false; } break; @@ -235,9 +247,10 @@ bool ValuesRuntime::InputBoolean(UNUSED_ATTRIBUTE const type::Type &type, // 'o' not enough to distinguish between on/off static constexpr char kOff[] = "off"; static constexpr char kOn[] = "on"; - if (strncasecmp(start, kOff, std::min(trimmed_len, sizeof(kOff)))) { + if (strncasecmp(start, kOff, (trimmed_len > 3 ? trimmed_len : 3)) == 0) { return false; - } else if (strncasecmp(start, kOn, std::min(trimmed_len, sizeof(kOn)))) { + } else if (strncasecmp(start, kOn, (trimmed_len > 2 ? trimmed_len : 2)) == + 0) { return true; } break; @@ -267,28 +280,24 @@ bool ValuesRuntime::InputBoolean(UNUSED_ATTRIBUTE const type::Type &type, int8_t ValuesRuntime::InputTinyInt(UNUSED_ATTRIBUTE const type::Type &type, const char *ptr, uint32_t len) { PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); - PELOTON_ASSERT(len != 0 && "Length must be non-zero"); return ToNum(ptr, len); } int16_t ValuesRuntime::InputSmallInt(UNUSED_ATTRIBUTE const type::Type &type, const char *ptr, uint32_t len) { PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); - PELOTON_ASSERT(len != 0 && "Length must be non-zero"); return ToNum(ptr, len); } int32_t ValuesRuntime::InputInteger(UNUSED_ATTRIBUTE const type::Type &type, const char *ptr, uint32_t len) { PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); - PELOTON_ASSERT(len != 0 && "Length must be non-zero"); return ToNum(ptr, len); } int64_t ValuesRuntime::InputBigInt(UNUSED_ATTRIBUTE const type::Type &type, const char *ptr, uint32_t len) { PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); - PELOTON_ASSERT(len != 0 && "Length must be non-zero"); return ToNum(ptr, len); } diff --git a/src/include/codegen/operator/csv_scan_translator.h b/src/include/codegen/operator/csv_scan_translator.h index 12e132ab4ce..3389e1e5c09 100644 --- a/src/include/codegen/operator/csv_scan_translator.h +++ b/src/include/codegen/operator/csv_scan_translator.h @@ -54,15 +54,12 @@ class CSVScanTranslator : public OperatorTranslator { std::string GetName() const override; private: - // Plan accessor - const planner::CSVScanPlan &GetScanPlan() const { return scan_; } - - llvm::Value *ConstructColumnDescriptor() const; - - private: - // The scan + // The plan const planner::CSVScanPlan &scan_; + // The set of attributes output by the csv scan + std::vector output_attributes_; + // The scanner state ID RuntimeState::StateID scanner_id_; diff --git a/src/include/codegen/proxy/csv_scanner_proxy.h b/src/include/codegen/proxy/csv_scanner_proxy.h index ae9b13cd7ec..ee27ce2b003 100644 --- a/src/include/codegen/proxy/csv_scanner_proxy.h +++ b/src/include/codegen/proxy/csv_scanner_proxy.h @@ -15,12 +15,13 @@ #include "codegen/proxy/proxy.h" #include "codegen/proxy/type_builder.h" #include "codegen/util/csv_scanner.h" +#include "codegen/proxy/runtime_functions_proxy.h" namespace peloton { namespace codegen { PROXY(CSVScannerColumn) { - DECLARE_MEMBER(0, char[sizeof(type::Type)], type); + DECLARE_MEMBER(0, type::Type, type); DECLARE_MEMBER(1, char *, ptr); DECLARE_MEMBER(2, uint32_t, len); DECLARE_MEMBER(3, bool, is_null); diff --git a/src/include/planner/csv_scan_plan.h b/src/include/planner/csv_scan_plan.h index f9611b22630..2f40999efd0 100644 --- a/src/include/planner/csv_scan_plan.h +++ b/src/include/planner/csv_scan_plan.h @@ -34,9 +34,14 @@ class CSVScanPlan : public AbstractScan { * * @param file_name The file path * @param cols Information of the columns expected in each row of the CSV + * @param delimiter The character that separates columns within a row + * @param quote The character used to quote data (i.e., strings) + * @param escape The character that should appear before any data characters + * that match the quote character. */ CSVScanPlan(std::string file_name, std::vector &&cols, - char delimiter = ',', char quote = '"', char escape = '"'); + char delimiter = ',', char quote = '"', char escape = '"', + std::string null = ""); ////////////////////////////////////////////////////////////////////////////// /// @@ -55,6 +60,7 @@ class CSVScanPlan : public AbstractScan { char GetDelimiterChar() const { return delimiter_; } char GetQuoteChar() const { return quote_; } char GetEscapeChar() const { return escape_; } + const std::string &GetNullString() const { return null_; } ////////////////////////////////////////////////////////////////////////////// /// @@ -76,8 +82,9 @@ class CSVScanPlan : public AbstractScan { char delimiter_; char quote_; char escape_; + const std::string null_; - std::vector> attributes_; + std::vector attributes_; }; //////////////////////////////////////////////////////////////////////////////// @@ -88,17 +95,19 @@ class CSVScanPlan : public AbstractScan { inline CSVScanPlan::CSVScanPlan(std::string file_name, std::vector &&cols, - char delimiter, char quote, char escape) + char delimiter, char quote, char escape, + std::string null) : file_name_(std::move(file_name)), delimiter_(delimiter), quote_(quote), - escape_(escape) { - for (const auto &col : cols) { - std::unique_ptr attribute{ - new planner::AttributeInfo()}; - attribute->name = col.name; - attribute->type = codegen::type::Type{col.type, true}; - attributes_.emplace_back(std::move(attribute)); + escape_(escape), + null_(null) { + attributes_.resize(cols.size()); + for (uint32_t i = 0; i < cols.size(); i++) { + const auto &col_info = cols[i]; + attributes_[i].type = codegen::type::Type{col_info.type, true}; + attributes_[i].attribute_id = i; + attributes_[i].name = col_info.name; } } @@ -109,8 +118,8 @@ inline PlanNodeType CSVScanPlan::GetPlanNodeType() const { inline std::unique_ptr CSVScanPlan::Copy() const { std::vector new_cols; for (const auto &attribute : attributes_) { - new_cols.push_back(CSVScanPlan::ColumnInfo{ - .name = attribute->name, .type = attribute->type.type_id}); + new_cols.push_back(CSVScanPlan::ColumnInfo{.name = attribute.name, + .type = attribute.type.type_id}); } return std::unique_ptr( new CSVScanPlan(file_name_, std::move(new_cols))); @@ -118,7 +127,7 @@ inline std::unique_ptr CSVScanPlan::Copy() const { inline void CSVScanPlan::PerformBinding(BindingContext &binding_context) { for (uint32_t i = 0; i < attributes_.size(); i++) { - binding_context.BindNew(i, attributes_[i].get()); + binding_context.BindNew(i, &attributes_[i]); } } @@ -142,7 +151,7 @@ inline void CSVScanPlan::GetAttributes( std::vector &ais) const { ais.clear(); for (const auto &ai : attributes_) { - ais.push_back(ai.get()); + ais.push_back(&ai); } } From 66bb521b76df43db6757c0c4267718052cf76f1c Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Wed, 9 May 2018 02:25:20 -0400 Subject: [PATCH 21/42] Moved TupleRuntime::CreateVarlen() into ValuesRuntime::WriteVarlen(). Better code organization and clearer name. --- src/codegen/operator/csv_scan_translator.cpp | 6 ++-- src/codegen/proxy/tuple_runtime_proxy.cpp | 24 ------------- src/codegen/proxy/values_runtime_proxy.cpp | 5 ++- src/codegen/table_storage.cpp | 6 ++-- src/codegen/tuple_runtime.cpp | 35 ------------------- src/codegen/values_runtime.cpp | 19 ++++++++++ .../codegen/proxy/tuple_runtime_proxy.h | 25 ------------- .../codegen/proxy/values_runtime_proxy.h | 4 ++- src/include/codegen/tuple_runtime.h | 32 ----------------- src/include/codegen/values_runtime.h | 16 +++++++++ 10 files changed, 48 insertions(+), 124 deletions(-) delete mode 100644 src/codegen/proxy/tuple_runtime_proxy.cpp delete mode 100644 src/codegen/tuple_runtime.cpp delete mode 100644 src/include/codegen/proxy/tuple_runtime_proxy.h delete mode 100644 src/include/codegen/tuple_runtime.h diff --git a/src/codegen/operator/csv_scan_translator.cpp b/src/codegen/operator/csv_scan_translator.cpp index d1c191a9ccd..8603a043e89 100644 --- a/src/codegen/operator/csv_scan_translator.cpp +++ b/src/codegen/operator/csv_scan_translator.cpp @@ -88,10 +88,10 @@ namespace { class CSVColumnAccess : public RowBatch::AttributeAccess { public: CSVColumnAccess(const planner::AttributeInfo *ai, llvm::Value *csv_columns, - const std::string &null_str, llvm::Value *runtime_null_str) + std::string null_str, llvm::Value *runtime_null_str) : ai_(ai), csv_columns_(csv_columns), - null_str_(null_str), + null_str_(std::move(null_str)), runtime_null_(runtime_null_str) {} llvm::Value *Columns() const { return csv_columns_; } @@ -172,7 +172,7 @@ class CSVColumnAccess : public RowBatch::AttributeAccess { private: const planner::AttributeInfo *ai_; llvm::Value *csv_columns_; - const std::string &null_str_; + const std::string null_str_; llvm::Value *runtime_null_; }; diff --git a/src/codegen/proxy/tuple_runtime_proxy.cpp b/src/codegen/proxy/tuple_runtime_proxy.cpp deleted file mode 100644 index 128c938f522..00000000000 --- a/src/codegen/proxy/tuple_runtime_proxy.cpp +++ /dev/null @@ -1,24 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Peloton -// -// tuple_runtime_proxy.cpp -// -// Identification: src/codegen/proxy/tuple_runtime_proxy.cpp -// -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -#include "codegen/proxy/tuple_runtime_proxy.h" - -#include "codegen/tuple_runtime.h" -#include "codegen/proxy/pool_proxy.h" - -namespace peloton { -namespace codegen { - -DEFINE_METHOD(peloton::codegen, TupleRuntime, CreateVarlen); - -} // namespace codegen -} // namespace peloton diff --git a/src/codegen/proxy/values_runtime_proxy.cpp b/src/codegen/proxy/values_runtime_proxy.cpp index 37f90834362..1c32b6259b0 100644 --- a/src/codegen/proxy/values_runtime_proxy.cpp +++ b/src/codegen/proxy/values_runtime_proxy.cpp @@ -6,7 +6,7 @@ // // Identification: src/codegen/proxy/values_runtime_proxy.cpp // -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -14,6 +14,7 @@ #include "codegen/proxy/value_proxy.h" #include "codegen/proxy/runtime_functions_proxy.h" +#include "codegen/proxy/pool_proxy.h" namespace peloton { namespace codegen { @@ -37,5 +38,7 @@ DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputBigInt); DEFINE_METHOD(peloton::codegen, ValuesRuntime, CompareStrings); +DEFINE_METHOD(peloton::codegen, ValuesRuntime, WriteVarlen); + } // namespace codegen } // namespace peloton diff --git a/src/codegen/table_storage.cpp b/src/codegen/table_storage.cpp index 198c7df9f2a..99df998b644 100644 --- a/src/codegen/table_storage.cpp +++ b/src/codegen/table_storage.cpp @@ -6,7 +6,7 @@ // // Identification: src/codegen/table_storage.cpp // -// Copyright (c) 2015-17, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -14,7 +14,7 @@ #include "catalog/schema.h" #include "codegen/lang/if.h" -#include "codegen/proxy/tuple_runtime_proxy.h" +#include "codegen/proxy/values_runtime_proxy.h" #include "codegen/type/sql_type.h" #include "codegen/type/type.h" #include "codegen/value.h" @@ -49,7 +49,7 @@ void TableStorage::StoreValues(CodeGen &codegen, llvm::Value *tuple_ptr, } value_is_null.ElseBlock(); { - codegen.Call(TupleRuntimeProxy::CreateVarlen, + codegen.Call(ValuesRuntimeProxy::WriteVarlen, {value.GetValue(), value.GetLength(), val_ptr, pool}); } value_is_null.EndIf(); diff --git a/src/codegen/tuple_runtime.cpp b/src/codegen/tuple_runtime.cpp deleted file mode 100644 index d065feed5d9..00000000000 --- a/src/codegen/tuple_runtime.cpp +++ /dev/null @@ -1,35 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Peloton -// -// tuple_runtime.cpp -// -// Identification: src/codegen/tuple_runtime.cpp -// -// Copyright (c) 2015-17, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -#include "codegen/tuple_runtime.h" -#include "type/abstract_pool.h" - -namespace peloton { -namespace codegen { - -void TupleRuntime::CreateVarlen(char *data, uint32_t len, char *buf, - peloton::type::AbstractPool *pool) { - struct varlen_t { - uint32_t len; - char data[0]; - }; - - auto *area = - reinterpret_cast(pool->Allocate(sizeof(uint32_t) + len)); - area->len = len; - PELOTON_MEMCPY(area->data, data, len); - - *reinterpret_cast(buf) = area; -} - -} // namespace codegen -} // namespace peloton diff --git a/src/codegen/values_runtime.cpp b/src/codegen/values_runtime.cpp index 9796b5457f6..2c2c771e845 100644 --- a/src/codegen/values_runtime.cpp +++ b/src/codegen/values_runtime.cpp @@ -16,6 +16,7 @@ #include "codegen/runtime_functions.h" #include "codegen/type/type.h" +#include "type/abstract_pool.h" #include "type/value.h" #include "type/type_util.h" #include "type/value_factory.h" @@ -306,5 +307,23 @@ int32_t ValuesRuntime::CompareStrings(const char *str1, uint32_t len1, return peloton::type::TypeUtil::CompareStrings(str1, len1, str2, len2); } +void ValuesRuntime::WriteVarlen(const char *data, uint32_t len, char *buf, + peloton::type::AbstractPool &pool) { + struct Varlen { + uint32_t len; + char data[0]; + }; + + // Allocate memory for the Varlen object + auto *area = static_cast(pool.Allocate(sizeof(uint32_t) + len)); + + // Populate it + area->len = len; + PELOTON_MEMCPY(area->data, data, len); + + // Store a pointer to the Varlen object into the target memory space + *reinterpret_cast(buf) = area; +} + } // namespace codegen } // namespace peloton diff --git a/src/include/codegen/proxy/tuple_runtime_proxy.h b/src/include/codegen/proxy/tuple_runtime_proxy.h deleted file mode 100644 index e166349575e..00000000000 --- a/src/include/codegen/proxy/tuple_runtime_proxy.h +++ /dev/null @@ -1,25 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Peloton -// -// tuple_runtime_proxy.h -// -// Identification: src/include/codegen/proxy/tuple_runtime_proxy.h -// -// Copyright (c) 2015-2018, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "codegen/proxy/proxy.h" - -namespace peloton { -namespace codegen { - -PROXY(TupleRuntime) { - DECLARE_METHOD(CreateVarlen); -}; - -} // namespace codegen -} // namespace peloton diff --git a/src/include/codegen/proxy/values_runtime_proxy.h b/src/include/codegen/proxy/values_runtime_proxy.h index 3fe57ab36fb..85d9d1cfb85 100644 --- a/src/include/codegen/proxy/values_runtime_proxy.h +++ b/src/include/codegen/proxy/values_runtime_proxy.h @@ -6,7 +6,7 @@ // // Identification: src/include/codegen/proxy/values_runtime_proxy.h // -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -37,6 +37,8 @@ PROXY(ValuesRuntime) { DECLARE_METHOD(InputBigInt); DECLARE_METHOD(CompareStrings); + + DECLARE_METHOD(WriteVarlen); }; } // namespace codegen diff --git a/src/include/codegen/tuple_runtime.h b/src/include/codegen/tuple_runtime.h deleted file mode 100644 index 86532055c7a..00000000000 --- a/src/include/codegen/tuple_runtime.h +++ /dev/null @@ -1,32 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Peloton -// -// tuple_runtime.h -// -// Identification: src/include/codegen/tuple_runtime.h -// -// Copyright (c) 2015-17, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "common/internal_types.h" - -namespace peloton { - -namespace type { -class AbstractPool; -} // namespace type - -namespace codegen { - -class TupleRuntime { - public: - static void CreateVarlen(char *data, uint32_t len, char *buf, - peloton::type::AbstractPool *pool); -}; - -} // namespace codegen -} // namespace peloton diff --git a/src/include/codegen/values_runtime.h b/src/include/codegen/values_runtime.h index 206e9ed9bb2..e01b93c54f1 100644 --- a/src/include/codegen/values_runtime.h +++ b/src/include/codegen/values_runtime.h @@ -15,6 +15,11 @@ #include namespace peloton { + +namespace type { +class AbstractPool; +} // namespace type + namespace codegen { namespace type { @@ -93,6 +98,17 @@ class ValuesRuntime { */ static int32_t CompareStrings(const char *str1, uint32_t len1, const char *str2, uint32_t len2); + + /** + * Write the provided variable length object into the target buffer. + * + * @param data The bytes we wish to serialize + * @param len The length of the byte array + * @param buf The target position we wish to write to + * @param pool A memory pool to source memory from + */ + static void WriteVarlen(const char *data, uint32_t len, char *buf, + peloton::type::AbstractPool &pool); }; } // namespace codegen From e318f661f76ceab089a9ff55a626c54897f8e26e Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Mon, 14 May 2018 16:08:09 -0400 Subject: [PATCH 22/42] Added error handling for long columns. Added null-terminator byte for when read-buffers are copied to line-buffers. --- src/codegen/util/csv_scanner.cpp | 45 ++++++++++++++++++++------ src/include/codegen/util/csv_scanner.h | 7 ++-- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/src/codegen/util/csv_scanner.cpp b/src/codegen/util/csv_scanner.cpp index a7140ad322d..da606fcaac3 100644 --- a/src/codegen/util/csv_scanner.cpp +++ b/src/codegen/util/csv_scanner.cpp @@ -37,6 +37,7 @@ CSVScanner::CSVScanner(peloton::type::AbstractPool &pool, line_(nullptr), line_len_(0), line_maxlen_(0), + line_number_(0), delimiter_(delimiter), quote_(quote), escape_(escape), @@ -110,15 +111,16 @@ void CSVScanner::Initialize() { file_.Open(file_path_, peloton::util::File::AccessMode::ReadOnly); // Allocate buffer space - buffer_ = static_cast(memory_.Allocate(kDefaultBufferSize + 1)); + buffer_ = static_cast(memory_.Allocate(kDefaultBufferSize)); // Fill read-buffer NextBuffer(); - // Allocate space for the full line, if it doesn't fit into the buffer + // Allocate space for the full line, if it doesn't fit into the buffer. We + // reserve the last byte for the null-byte terminator. line_ = static_cast(memory_.Allocate(kDefaultBufferSize)); line_len_ = 0; - line_maxlen_ = kDefaultBufferSize; + line_maxlen_ = kDefaultBufferSize - 1; } bool CSVScanner::NextBuffer() { @@ -139,12 +141,25 @@ void CSVScanner::AppendToCurrentLine(const char *data, uint32_t len) { } if (line_len_ + len > line_maxlen_) { + // Check if we can even allocate any more bytes + if (static_cast(len) > kMaxAllocSize - line_len_) { + const auto msg = StringUtil::Format( + "Line %u in file '%s' exceeds maximum line length: %lu", + line_number_ + 1, file_path_.c_str(), kMaxAllocSize); + throw Exception{msg}; + } + // The current line buffer isn't large enough to store the new bytes, so we - // need to resize it. By default, we double the capacity. - auto new_maxlen = line_maxlen_ * 2; + // need to resize it. Let's find an allocation size large enough to fit the + // new bytes. + uint32_t new_maxlen = line_maxlen_ * 2; while (new_maxlen < len) { new_maxlen *= 2; } + + // Clamp + new_maxlen = std::min(new_maxlen, static_cast(kMaxAllocSize)); + auto *new_line = static_cast(memory_.Allocate(new_maxlen)); // Copy the old data @@ -155,15 +170,14 @@ void CSVScanner::AppendToCurrentLine(const char *data, uint32_t len) { // Setup pointers and sizes line_ = new_line; - line_maxlen_ = new_maxlen; + line_maxlen_ = new_maxlen - 1; stats_.num_reallocs++; } - // At this point, we've guaranteed that the line is large enough to - // accommodate the new bytes, so let's go ahead and perform the copy. - + // Copy provided data into the line buffer, ensuring null-byte termination. PELOTON_MEMCPY(line_ + line_len_, data, len); + line_[line_len_ + len] = '\0'; // Increase the length of the line line_len_ += len; @@ -246,11 +260,22 @@ void CSVScanner::ProduceCSV(const char *line) { const auto *iter = line; for (uint32_t col_idx = 0; col_idx < num_cols_; col_idx++) { + // Start points to the beginning of the column's data value const char *start = iter; - for (; *iter != 0 && *iter != delimiter_; iter++) {} + + // Eat text until the next delimiter + while (*iter != 0 && *iter != delimiter_) { + iter++; + } + + // At this point, iter points to the end of the column's data value + + // Let's setup the columns cols_[col_idx].ptr = start; cols_[col_idx].len = static_cast(iter - start); cols_[col_idx].is_null = (cols_[col_idx].len == 0); + + // Eat delimiter, moving to next column iter++; } diff --git a/src/include/codegen/util/csv_scanner.h b/src/include/codegen/util/csv_scanner.h index d51475c4c43..a946dec903e 100644 --- a/src/include/codegen/util/csv_scanner.h +++ b/src/include/codegen/util/csv_scanner.h @@ -47,7 +47,10 @@ namespace util { class CSVScanner { public: // 64K buffer size - static constexpr uint32_t kDefaultBufferSize = (1ul << 16); + static constexpr uint32_t kDefaultBufferSize = (1ul << 16ul); + + // We allocate a maximum of 1GB for the line buffer + static constexpr uint64_t kMaxAllocSize = (1ul << 30ul); // The signature of the callback function using Callback = void (*)(void *); @@ -70,7 +73,7 @@ class CSVScanner { }; /** - * Various statistics tracked while we scan the CSV + * This structure tracks various statistics while we scan the CSV */ struct Stats { // The number of times the read-buffer was copied into the line-buffer From 447932ce8b168742da318f2bf0b194d0a89e574b Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Mon, 14 May 2018 16:14:48 -0400 Subject: [PATCH 23/42] Added inputs for decimal types --- src/codegen/proxy/values_runtime_proxy.cpp | 1 + src/codegen/type/decimal_type.cpp | 6 +-- src/codegen/values_runtime.cpp | 38 +++++++++++++++++++ .../codegen/proxy/values_runtime_proxy.h | 1 + src/include/codegen/values_runtime.h | 3 ++ 5 files changed, 46 insertions(+), 3 deletions(-) diff --git a/src/codegen/proxy/values_runtime_proxy.cpp b/src/codegen/proxy/values_runtime_proxy.cpp index 1c32b6259b0..530ad6b4e20 100644 --- a/src/codegen/proxy/values_runtime_proxy.cpp +++ b/src/codegen/proxy/values_runtime_proxy.cpp @@ -35,6 +35,7 @@ DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputTinyInt); DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputSmallInt); DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputInteger); DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputBigInt); +DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputDecimal); DEFINE_METHOD(peloton::codegen, ValuesRuntime, CompareStrings); diff --git a/src/codegen/type/decimal_type.cpp b/src/codegen/type/decimal_type.cpp index 50a0b09e29c..92cc7ec5b6a 100644 --- a/src/codegen/type/decimal_type.cpp +++ b/src/codegen/type/decimal_type.cpp @@ -559,9 +559,9 @@ void Decimal::GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, } llvm::Function *Decimal::GetInputFunction( - UNUSED_ATTRIBUTE CodeGen &codegen, - UNUSED_ATTRIBUTE const Type &type) const { - throw NotImplementedException{"Decimal inputs not implemented yet"}; + CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { + // TODO: We should be using the precision/scale in the output function + return ValuesRuntimeProxy::InputDecimal.GetFunction(codegen); } llvm::Function *Decimal::GetOutputFunction( diff --git a/src/codegen/values_runtime.cpp b/src/codegen/values_runtime.cpp index 2c2c771e845..e23e552813a 100644 --- a/src/codegen/values_runtime.cpp +++ b/src/codegen/values_runtime.cpp @@ -189,6 +189,32 @@ typename std::enable_if::value, T>::type ToNum( return static_cast(num); } +template +typename std::enable_if::value, T>::type ToNum( + const char *ptr, uint32_t len) { + if (len == 0) { + RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); + } + + // TODO(pmenon): Optimize me later + char *end = nullptr; + auto ret = std::strtod(ptr, &end); + + if (unlikely_branch(end == ptr)) { + if (errno == ERANGE) { + RuntimeFunctions::ThrowOverflowException(); + __builtin_unreachable(); + } else { + RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); + } + } + + // Done + return static_cast(ret); +} + } // namespace bool ValuesRuntime::InputBoolean(UNUSED_ATTRIBUTE const type::Type &type, @@ -302,6 +328,18 @@ int64_t ValuesRuntime::InputBigInt(UNUSED_ATTRIBUTE const type::Type &type, return ToNum(ptr, len); } +double ValuesRuntime::InputDecimal(UNUSED_ATTRIBUTE const type::Type &type, + const char *ptr, uint32_t len) { + PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); + return ToNum(ptr, len); +} + +//////////////////////////////////////////////////////////////////////////////// +/// +/// String comparison +/// +//////////////////////////////////////////////////////////////////////////////// + int32_t ValuesRuntime::CompareStrings(const char *str1, uint32_t len1, const char *str2, uint32_t len2) { return peloton::type::TypeUtil::CompareStrings(str1, len1, str2, len2); diff --git a/src/include/codegen/proxy/values_runtime_proxy.h b/src/include/codegen/proxy/values_runtime_proxy.h index 85d9d1cfb85..059f700d8c6 100644 --- a/src/include/codegen/proxy/values_runtime_proxy.h +++ b/src/include/codegen/proxy/values_runtime_proxy.h @@ -35,6 +35,7 @@ PROXY(ValuesRuntime) { DECLARE_METHOD(InputSmallInt); DECLARE_METHOD(InputInteger); DECLARE_METHOD(InputBigInt); + DECLARE_METHOD(InputDecimal); DECLARE_METHOD(CompareStrings); diff --git a/src/include/codegen/values_runtime.h b/src/include/codegen/values_runtime.h index e01b93c54f1..905ead1fd68 100644 --- a/src/include/codegen/values_runtime.h +++ b/src/include/codegen/values_runtime.h @@ -87,6 +87,9 @@ class ValuesRuntime { static int64_t InputBigInt(const type::Type &type, const char *ptr, uint32_t len); + static double InputDecimal(const type::Type &type, const char *ptr, + uint32_t len); + /** * Compare two strings, returning an integer value indicating their sort order * From d1e214a7bc4e0e0f01f17edf1624ac72735097b6 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Tue, 15 May 2018 22:04:38 -0400 Subject: [PATCH 24/42] Moved type-specific functions into function namespace --- src/catalog/catalog.cpp | 54 +-- src/codegen/proxy/date_functions_proxy.cpp | 7 +- src/codegen/proxy/decimal_functions_proxy.cpp | 30 -- src/codegen/proxy/numeric_functions_proxy.cpp | 37 ++ src/codegen/proxy/string_functions_proxy.cpp | 10 +- src/codegen/proxy/values_runtime_proxy.cpp | 11 - src/codegen/table_storage.cpp | 4 +- src/codegen/type/bigint_type.cpp | 5 +- src/codegen/type/boolean_type.cpp | 5 +- src/codegen/type/date_type.cpp | 6 +- src/codegen/type/decimal_type.cpp | 12 +- src/codegen/type/integer_type.cpp | 5 +- src/codegen/type/smallint_type.cpp | 5 +- src/codegen/type/tinyint_type.cpp | 5 +- src/codegen/type/varbinary_type.cpp | 3 +- src/codegen/type/varchar_type.cpp | 2 +- src/codegen/values_runtime.cpp | 264 ----------- src/function/date_functions.cpp | 33 +- src/function/decimal_functions.cpp | 152 ------- src/function/numeric_functions.cpp | 417 ++++++++++++++++++ src/function/string_functions.cpp | 25 ++ .../codegen/proxy/date_functions_proxy.h | 7 +- ...ions_proxy.h => numeric_functions_proxy.h} | 19 +- .../codegen/proxy/string_functions_proxy.h | 2 + .../codegen/proxy/values_runtime_proxy.h | 11 - src/include/codegen/values_runtime.h | 53 --- src/include/common/container_tuple.h | 3 +- src/include/function/date_functions.h | 22 + src/include/function/decimal_functions.h | 46 -- src/include/function/numeric_functions.h | 81 ++++ src/include/function/string_functions.h | 30 ++ test/codegen/value_integrity_test.cpp | 11 +- test/function/decimal_functions_test.cpp | 46 +- 33 files changed, 748 insertions(+), 675 deletions(-) delete mode 100644 src/codegen/proxy/decimal_functions_proxy.cpp create mode 100644 src/codegen/proxy/numeric_functions_proxy.cpp delete mode 100644 src/function/decimal_functions.cpp create mode 100644 src/function/numeric_functions.cpp rename src/include/codegen/proxy/{decimal_functions_proxy.h => numeric_functions_proxy.h} (52%) delete mode 100644 src/include/function/decimal_functions.h create mode 100644 src/include/function/numeric_functions.h diff --git a/src/catalog/catalog.cpp b/src/catalog/catalog.cpp index 0759da7d42f..adc4f77b66e 100644 --- a/src/catalog/catalog.cpp +++ b/src/catalog/catalog.cpp @@ -30,7 +30,7 @@ #include "codegen/code_context.h" #include "concurrency/transaction_manager_factory.h" #include "function/date_functions.h" -#include "function/decimal_functions.h" +#include "function/numeric_functions.h" #include "function/old_engine_string_functions.h" #include "function/timestamp_functions.h" #include "index/index_factory.h" @@ -1283,43 +1283,43 @@ void Catalog::InitializeFunctions() { AddBuiltinFunction("abs", {type::TypeId::DECIMAL}, type::TypeId::DECIMAL, internal_lang, "Abs", function::BuiltInFuncType{ - OperatorId::Abs, function::DecimalFunctions::_Abs}, + OperatorId::Abs, function::NumericFunctions::_Abs}, txn); AddBuiltinFunction( "sqrt", {type::TypeId::TINYINT}, type::TypeId::DECIMAL, internal_lang, "Sqrt", function::BuiltInFuncType{OperatorId::Sqrt, - function::DecimalFunctions::Sqrt}, + function::NumericFunctions::Sqrt}, txn); AddBuiltinFunction( "sqrt", {type::TypeId::SMALLINT}, type::TypeId::DECIMAL, internal_lang, "Sqrt", function::BuiltInFuncType{OperatorId::Sqrt, - function::DecimalFunctions::Sqrt}, + function::NumericFunctions::Sqrt}, txn); AddBuiltinFunction( "sqrt", {type::TypeId::INTEGER}, type::TypeId::DECIMAL, internal_lang, "Sqrt", function::BuiltInFuncType{OperatorId::Sqrt, - function::DecimalFunctions::Sqrt}, + function::NumericFunctions::Sqrt}, txn); AddBuiltinFunction( "sqrt", {type::TypeId::BIGINT}, type::TypeId::DECIMAL, internal_lang, "Sqrt", function::BuiltInFuncType{OperatorId::Sqrt, - function::DecimalFunctions::Sqrt}, + function::NumericFunctions::Sqrt}, txn); AddBuiltinFunction( "sqrt", {type::TypeId::DECIMAL}, type::TypeId::DECIMAL, internal_lang, "Sqrt", function::BuiltInFuncType{OperatorId::Sqrt, - function::DecimalFunctions::Sqrt}, + function::NumericFunctions::Sqrt}, txn); AddBuiltinFunction( "floor", {type::TypeId::DECIMAL}, type::TypeId::DECIMAL, internal_lang, "Floor", function::BuiltInFuncType{OperatorId::Floor, - function::DecimalFunctions::_Floor}, + function::NumericFunctions::_Floor}, txn); /** @@ -1328,126 +1328,126 @@ void Catalog::InitializeFunctions() { AddBuiltinFunction("abs", {type::TypeId::TINYINT}, type::TypeId::TINYINT, internal_lang, "Abs", function::BuiltInFuncType{ - OperatorId::Abs, function::DecimalFunctions::_Abs}, + OperatorId::Abs, function::NumericFunctions::_Abs}, txn); AddBuiltinFunction("abs", {type::TypeId::SMALLINT}, type::TypeId::SMALLINT, internal_lang, "Abs", function::BuiltInFuncType{ - OperatorId::Abs, function::DecimalFunctions::_Abs}, + OperatorId::Abs, function::NumericFunctions::_Abs}, txn); AddBuiltinFunction("abs", {type::TypeId::INTEGER}, type::TypeId::INTEGER, internal_lang, "Abs", function::BuiltInFuncType{ - OperatorId::Abs, function::DecimalFunctions::_Abs}, + OperatorId::Abs, function::NumericFunctions::_Abs}, txn); AddBuiltinFunction("abs", {type::TypeId::BIGINT}, type::TypeId::BIGINT, internal_lang, "Abs", function::BuiltInFuncType{ - OperatorId::Abs, function::DecimalFunctions::_Abs}, + OperatorId::Abs, function::NumericFunctions::_Abs}, txn); AddBuiltinFunction( "floor", {type::TypeId::INTEGER}, type::TypeId::DECIMAL, internal_lang, "Floor", function::BuiltInFuncType{OperatorId::Floor, - function::DecimalFunctions::_Floor}, + function::NumericFunctions::_Floor}, txn); AddBuiltinFunction( "floor", {type::TypeId::BIGINT}, type::TypeId::DECIMAL, internal_lang, "Floor", function::BuiltInFuncType{OperatorId::Floor, - function::DecimalFunctions::_Floor}, + function::NumericFunctions::_Floor}, txn); AddBuiltinFunction( "floor", {type::TypeId::TINYINT}, type::TypeId::DECIMAL, internal_lang, "Floor", function::BuiltInFuncType{OperatorId::Floor, - function::DecimalFunctions::_Floor}, + function::NumericFunctions::_Floor}, txn); AddBuiltinFunction( "floor", {type::TypeId::SMALLINT}, type::TypeId::DECIMAL, internal_lang, "Floor", function::BuiltInFuncType{OperatorId::Floor, - function::DecimalFunctions::_Floor}, + function::NumericFunctions::_Floor}, txn); AddBuiltinFunction( "round", {type::TypeId::DECIMAL}, type::TypeId::DECIMAL, internal_lang, "Round", function::BuiltInFuncType{OperatorId::Round, - function::DecimalFunctions::_Round}, + function::NumericFunctions::_Round}, txn); AddBuiltinFunction( "ceil", {type::TypeId::DECIMAL}, type::TypeId::DECIMAL, internal_lang, "Ceil", function::BuiltInFuncType{OperatorId::Ceil, - function::DecimalFunctions::_Ceil}, + function::NumericFunctions::_Ceil}, txn); AddBuiltinFunction( "ceil", {type::TypeId::TINYINT}, type::TypeId::DECIMAL, internal_lang, "Ceil", function::BuiltInFuncType{OperatorId::Ceil, - function::DecimalFunctions::_Ceil}, + function::NumericFunctions::_Ceil}, txn); AddBuiltinFunction( "ceil", {type::TypeId::SMALLINT}, type::TypeId::DECIMAL, internal_lang, "Ceil", function::BuiltInFuncType{OperatorId::Ceil, - function::DecimalFunctions::_Ceil}, + function::NumericFunctions::_Ceil}, txn); AddBuiltinFunction( "ceil", {type::TypeId::INTEGER}, type::TypeId::DECIMAL, internal_lang, "Ceil", function::BuiltInFuncType{OperatorId::Ceil, - function::DecimalFunctions::_Ceil}, + function::NumericFunctions::_Ceil}, txn); AddBuiltinFunction( "ceil", {type::TypeId::BIGINT}, type::TypeId::DECIMAL, internal_lang, "Ceil", function::BuiltInFuncType{OperatorId::Ceil, - function::DecimalFunctions::_Ceil}, + function::NumericFunctions::_Ceil}, txn); AddBuiltinFunction( "ceiling", {type::TypeId::DECIMAL}, type::TypeId::DECIMAL, internal_lang, "Ceil", function::BuiltInFuncType{OperatorId::Ceil, - function::DecimalFunctions::_Ceil}, + function::NumericFunctions::_Ceil}, txn); AddBuiltinFunction( "ceiling", {type::TypeId::TINYINT}, type::TypeId::DECIMAL, internal_lang, "Ceil", function::BuiltInFuncType{OperatorId::Ceil, - function::DecimalFunctions::_Ceil}, + function::NumericFunctions::_Ceil}, txn); AddBuiltinFunction( "ceiling", {type::TypeId::SMALLINT}, type::TypeId::DECIMAL, internal_lang, "Ceil", function::BuiltInFuncType{OperatorId::Ceil, - function::DecimalFunctions::_Ceil}, + function::NumericFunctions::_Ceil}, txn); AddBuiltinFunction( "ceiling", {type::TypeId::INTEGER}, type::TypeId::DECIMAL, internal_lang, "Ceil", function::BuiltInFuncType{OperatorId::Ceil, - function::DecimalFunctions::_Ceil}, + function::NumericFunctions::_Ceil}, txn); AddBuiltinFunction( "ceiling", {type::TypeId::BIGINT}, type::TypeId::DECIMAL, internal_lang, "Ceil", function::BuiltInFuncType{OperatorId::Ceil, - function::DecimalFunctions::_Ceil}, + function::NumericFunctions::_Ceil}, txn); /** diff --git a/src/codegen/proxy/date_functions_proxy.cpp b/src/codegen/proxy/date_functions_proxy.cpp index de8f030ef4f..7bce9276f56 100644 --- a/src/codegen/proxy/date_functions_proxy.cpp +++ b/src/codegen/proxy/date_functions_proxy.cpp @@ -6,19 +6,24 @@ // // Identification: src/codegen/proxy/date_functions_proxy.cpp // -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// #include "codegen/proxy/date_functions_proxy.h" +#include "codegen/proxy/runtime_functions_proxy.h" #include "codegen/proxy/type_builder.h" #include "function/date_functions.h" namespace peloton { namespace codegen { +// Utility functions DEFINE_METHOD(peloton::function, DateFunctions, Now); +// Input functions +DEFINE_METHOD(peloton::function, DateFunctions, InputDate); + } // namespace codegen } // namespace peloton diff --git a/src/codegen/proxy/decimal_functions_proxy.cpp b/src/codegen/proxy/decimal_functions_proxy.cpp deleted file mode 100644 index 4cbc6d05640..00000000000 --- a/src/codegen/proxy/decimal_functions_proxy.cpp +++ /dev/null @@ -1,30 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Peloton -// -// decimal_functions_proxy.cpp -// -// Identification: src/codegen/proxy/decimal_functions_proxy.cpp -// -// Copyright (c) 2015-2018, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -#include "codegen/proxy/decimal_functions_proxy.h" - -#include "codegen/proxy/type_builder.h" -#include "function/decimal_functions.h" - -namespace peloton { -namespace codegen { - -DEFINE_METHOD(peloton::function, DecimalFunctions, Abs); - -DEFINE_METHOD(peloton::function, DecimalFunctions, Floor); - -DEFINE_METHOD(peloton::function, DecimalFunctions, Round); - -DEFINE_METHOD(peloton::function, DecimalFunctions, Ceil); - -} // namespace codegen -} // namespace peloton diff --git a/src/codegen/proxy/numeric_functions_proxy.cpp b/src/codegen/proxy/numeric_functions_proxy.cpp new file mode 100644 index 00000000000..133917b668d --- /dev/null +++ b/src/codegen/proxy/numeric_functions_proxy.cpp @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// numeric_functions_proxy.cpp +// +// Identification: src/codegen/proxy/numeric_functions_proxy.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "codegen/proxy/numeric_functions_proxy.h" + +#include "codegen/proxy/runtime_functions_proxy.h" +#include "codegen/proxy/type_builder.h" +#include "function/numeric_functions.h" + +namespace peloton { +namespace codegen { + +// Utility functions +DEFINE_METHOD(peloton::function, NumericFunctions, Abs); +DEFINE_METHOD(peloton::function, NumericFunctions, Floor); +DEFINE_METHOD(peloton::function, NumericFunctions, Round); +DEFINE_METHOD(peloton::function, NumericFunctions, Ceil); + +// Input functions +DEFINE_METHOD(peloton::function, NumericFunctions, InputBoolean); +DEFINE_METHOD(peloton::function, NumericFunctions, InputTinyInt); +DEFINE_METHOD(peloton::function, NumericFunctions, InputSmallInt); +DEFINE_METHOD(peloton::function, NumericFunctions, InputInteger); +DEFINE_METHOD(peloton::function, NumericFunctions, InputBigInt); +DEFINE_METHOD(peloton::function, NumericFunctions, InputDecimal); + +} // namespace codegen +} // namespace peloton diff --git a/src/codegen/proxy/string_functions_proxy.cpp b/src/codegen/proxy/string_functions_proxy.cpp index 32e25ccc0e1..db765480e9b 100644 --- a/src/codegen/proxy/string_functions_proxy.cpp +++ b/src/codegen/proxy/string_functions_proxy.cpp @@ -13,6 +13,7 @@ #include "codegen/proxy/string_functions_proxy.h" #include "codegen/proxy/executor_context_proxy.h" +#include "codegen/proxy/pool_proxy.h" namespace peloton { namespace codegen { @@ -20,18 +21,17 @@ namespace codegen { // StrWithLen struct DEFINE_TYPE(StrWithLen, "peloton::StrWithLen", str, length); -// String Function DEFINE_METHOD(peloton::function, StringFunctions, Ascii); DEFINE_METHOD(peloton::function, StringFunctions, Like); DEFINE_METHOD(peloton::function, StringFunctions, Length); -DEFINE_METHOD(peloton::function, StringFunctions, Substr); -DEFINE_METHOD(peloton::function, StringFunctions, Repeat); - -// Trim-related functions DEFINE_METHOD(peloton::function, StringFunctions, BTrim); DEFINE_METHOD(peloton::function, StringFunctions, Trim); DEFINE_METHOD(peloton::function, StringFunctions, LTrim); DEFINE_METHOD(peloton::function, StringFunctions, RTrim); +DEFINE_METHOD(peloton::function, StringFunctions, Substr); +DEFINE_METHOD(peloton::function, StringFunctions, Repeat); +DEFINE_METHOD(peloton::function, StringFunctions, CompareStrings); +DEFINE_METHOD(peloton::function, StringFunctions, WriteString); } // namespace codegen } // namespace peloton diff --git a/src/codegen/proxy/values_runtime_proxy.cpp b/src/codegen/proxy/values_runtime_proxy.cpp index 530ad6b4e20..0c30ef1d4ac 100644 --- a/src/codegen/proxy/values_runtime_proxy.cpp +++ b/src/codegen/proxy/values_runtime_proxy.cpp @@ -30,16 +30,5 @@ DEFINE_METHOD(peloton::codegen, ValuesRuntime, OutputDecimal); DEFINE_METHOD(peloton::codegen, ValuesRuntime, OutputVarchar); DEFINE_METHOD(peloton::codegen, ValuesRuntime, OutputVarbinary); -DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputBoolean); -DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputTinyInt); -DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputSmallInt); -DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputInteger); -DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputBigInt); -DEFINE_METHOD(peloton::codegen, ValuesRuntime, InputDecimal); - -DEFINE_METHOD(peloton::codegen, ValuesRuntime, CompareStrings); - -DEFINE_METHOD(peloton::codegen, ValuesRuntime, WriteVarlen); - } // namespace codegen } // namespace peloton diff --git a/src/codegen/table_storage.cpp b/src/codegen/table_storage.cpp index 99df998b644..e4240c6f7bc 100644 --- a/src/codegen/table_storage.cpp +++ b/src/codegen/table_storage.cpp @@ -14,7 +14,7 @@ #include "catalog/schema.h" #include "codegen/lang/if.h" -#include "codegen/proxy/values_runtime_proxy.h" +#include "codegen/proxy/string_functions_proxy.h" #include "codegen/type/sql_type.h" #include "codegen/type/type.h" #include "codegen/value.h" @@ -49,7 +49,7 @@ void TableStorage::StoreValues(CodeGen &codegen, llvm::Value *tuple_ptr, } value_is_null.ElseBlock(); { - codegen.Call(ValuesRuntimeProxy::WriteVarlen, + codegen.Call(StringFunctionsProxy::WriteString, {value.GetValue(), value.GetLength(), val_ptr, pool}); } value_is_null.EndIf(); diff --git a/src/codegen/type/bigint_type.cpp b/src/codegen/type/bigint_type.cpp index 9332bc51fbc..45b43b3ad46 100644 --- a/src/codegen/type/bigint_type.cpp +++ b/src/codegen/type/bigint_type.cpp @@ -6,7 +6,7 @@ // // Identification: src/codegen/type/bigint_type.cpp // -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -14,6 +14,7 @@ #include "codegen/lang/if.h" #include "codegen/value.h" +#include "codegen/proxy/numeric_functions_proxy.h" #include "codegen/proxy/values_runtime_proxy.h" #include "codegen/type/boolean_type.h" #include "codegen/type/decimal_type.h" @@ -597,7 +598,7 @@ void BigInt::GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Function *BigInt::GetInputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { - return ValuesRuntimeProxy::InputBigInt.GetFunction(codegen); + return NumericFunctionsProxy::InputBigInt.GetFunction(codegen); } llvm::Function *BigInt::GetOutputFunction( diff --git a/src/codegen/type/boolean_type.cpp b/src/codegen/type/boolean_type.cpp index 5f7387ed9b4..37668c761da 100644 --- a/src/codegen/type/boolean_type.cpp +++ b/src/codegen/type/boolean_type.cpp @@ -6,12 +6,13 @@ // // Identification: src/codegen/type/boolean_type.cpp // -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// #include "codegen/type/boolean_type.h" +#include "codegen/proxy/numeric_functions_proxy.h" #include "codegen/proxy/values_runtime_proxy.h" #include "codegen/type/integer_type.h" #include "codegen/type/varchar_type.h" @@ -326,7 +327,7 @@ void Boolean::GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Function *Boolean::GetInputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { - return ValuesRuntimeProxy::InputBoolean.GetFunction(codegen); + return NumericFunctionsProxy::InputBoolean.GetFunction(codegen); } llvm::Function *Boolean::GetOutputFunction( diff --git a/src/codegen/type/date_type.cpp b/src/codegen/type/date_type.cpp index 26342c23db9..5b541c32dcd 100644 --- a/src/codegen/type/date_type.cpp +++ b/src/codegen/type/date_type.cpp @@ -14,6 +14,7 @@ #include "codegen/lang/if.h" #include "codegen/value.h" +#include "codegen/proxy/date_functions_proxy.h" #include "codegen/proxy/values_runtime_proxy.h" #include "codegen/type/boolean_type.h" #include "codegen/type/integer_type.h" @@ -189,9 +190,8 @@ void Date::GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, } llvm::Function *Date::GetInputFunction( - UNUSED_ATTRIBUTE CodeGen &codegen, - UNUSED_ATTRIBUTE const Type &type) const { - throw NotImplementedException{"Date inputs not supported yet"}; + CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { + return DateFunctionsProxy::InputDate.GetFunction(codegen); } llvm::Function *Date::GetOutputFunction( diff --git a/src/codegen/type/decimal_type.cpp b/src/codegen/type/decimal_type.cpp index 92cc7ec5b6a..7f527092a1d 100644 --- a/src/codegen/type/decimal_type.cpp +++ b/src/codegen/type/decimal_type.cpp @@ -13,7 +13,7 @@ #include "codegen/type/decimal_type.h" #include "codegen/lang/if.h" -#include "codegen/proxy/decimal_functions_proxy.h" +#include "codegen/proxy/numeric_functions_proxy.h" #include "codegen/proxy/values_runtime_proxy.h" #include "codegen/type/boolean_type.h" #include "codegen/type/integer_type.h" @@ -194,7 +194,7 @@ struct Abs : public TypeSystem::UnaryOperatorHandleNull { UNUSED_ATTRIBUTE const TypeSystem::InvocationContext &ctx) const override { llvm::Value *raw_ret = - codegen.Call(DecimalFunctionsProxy::Abs, {val.GetValue()}); + codegen.Call(NumericFunctionsProxy::Abs, {val.GetValue()}); return Value{Decimal::Instance(), raw_ret}; } }; @@ -213,7 +213,7 @@ struct Floor : public TypeSystem::UnaryOperatorHandleNull { UNUSED_ATTRIBUTE const TypeSystem::InvocationContext &ctx) const override { llvm::Value *raw_ret = - codegen.Call(DecimalFunctionsProxy::Floor, {val.GetValue()}); + codegen.Call(NumericFunctionsProxy::Floor, {val.GetValue()}); return Value{Decimal::Instance(), raw_ret}; } }; @@ -232,7 +232,7 @@ struct Round : public TypeSystem::UnaryOperatorHandleNull { UNUSED_ATTRIBUTE const TypeSystem::InvocationContext &ctx) const override { llvm::Value *raw_ret = - codegen.Call(DecimalFunctionsProxy::Round, {val.GetValue()}); + codegen.Call(NumericFunctionsProxy::Round, {val.GetValue()}); return Value{Decimal::Instance(), raw_ret}; } }; @@ -252,7 +252,7 @@ struct Ceil : public TypeSystem::UnaryOperatorHandleNull { const override { PELOTON_ASSERT(SupportsType(val.GetType())); - auto *result = codegen.Call(DecimalFunctionsProxy::Ceil, {val.GetValue()}); + auto *result = codegen.Call(NumericFunctionsProxy::Ceil, {val.GetValue()}); return Value{Decimal::Instance(), result}; } @@ -561,7 +561,7 @@ void Decimal::GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Function *Decimal::GetInputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { // TODO: We should be using the precision/scale in the output function - return ValuesRuntimeProxy::InputDecimal.GetFunction(codegen); + return NumericFunctionsProxy::InputDecimal.GetFunction(codegen); } llvm::Function *Decimal::GetOutputFunction( diff --git a/src/codegen/type/integer_type.cpp b/src/codegen/type/integer_type.cpp index 92809098341..6d9a61ebde1 100644 --- a/src/codegen/type/integer_type.cpp +++ b/src/codegen/type/integer_type.cpp @@ -6,13 +6,14 @@ // // Identification: src/codegen/type/integer_type.cpp // -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// #include "codegen/type/integer_type.h" #include "codegen/lang/if.h" +#include "codegen/proxy/numeric_functions_proxy.h" #include "codegen/proxy/values_runtime_proxy.h" #include "codegen/type/boolean_type.h" #include "codegen/type/decimal_type.h" @@ -595,7 +596,7 @@ void Integer::GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Function *Integer::GetInputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { - return ValuesRuntimeProxy::InputInteger.GetFunction(codegen); + return NumericFunctionsProxy::InputInteger.GetFunction(codegen); } llvm::Function *Integer::GetOutputFunction( diff --git a/src/codegen/type/smallint_type.cpp b/src/codegen/type/smallint_type.cpp index e0f31561c95..b645af00ffe 100644 --- a/src/codegen/type/smallint_type.cpp +++ b/src/codegen/type/smallint_type.cpp @@ -6,7 +6,7 @@ // // Identification: src/codegen/type/smallint_type.cpp // -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -14,6 +14,7 @@ #include "codegen/lang/if.h" #include "codegen/value.h" +#include "codegen/proxy/numeric_functions_proxy.h" #include "codegen/proxy/values_runtime_proxy.h" #include "codegen/type/boolean_type.h" #include "codegen/type/decimal_type.h" @@ -606,7 +607,7 @@ void SmallInt::GetTypeForMaterialization(CodeGen &codegen, llvm::Function *SmallInt::GetInputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { - return ValuesRuntimeProxy::InputSmallInt.GetFunction(codegen); + return NumericFunctionsProxy::InputSmallInt.GetFunction(codegen); } llvm::Function *SmallInt::GetOutputFunction( diff --git a/src/codegen/type/tinyint_type.cpp b/src/codegen/type/tinyint_type.cpp index 24cad11558c..ab82f4982a2 100644 --- a/src/codegen/type/tinyint_type.cpp +++ b/src/codegen/type/tinyint_type.cpp @@ -6,7 +6,7 @@ // // Identification: src/codegen/type/tinyint_type.cpp // -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -14,6 +14,7 @@ #include "codegen/lang/if.h" #include "codegen/value.h" +#include "codegen/proxy/numeric_functions_proxy.h" #include "codegen/proxy/values_runtime_proxy.h" #include "codegen/type/boolean_type.h" #include "codegen/type/decimal_type.h" @@ -601,7 +602,7 @@ void TinyInt::GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, llvm::Function *TinyInt::GetInputFunction( CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { - return ValuesRuntimeProxy::InputTinyInt.GetFunction(codegen); + return NumericFunctionsProxy::InputTinyInt.GetFunction(codegen); } llvm::Function *TinyInt::GetOutputFunction( diff --git a/src/codegen/type/varbinary_type.cpp b/src/codegen/type/varbinary_type.cpp index bcbf0c8a1de..6d80b924243 100644 --- a/src/codegen/type/varbinary_type.cpp +++ b/src/codegen/type/varbinary_type.cpp @@ -13,6 +13,7 @@ #include "codegen/type/varbinary_type.h" #include "codegen/value.h" +#include "codegen/proxy/string_functions_proxy.h" #include "codegen/proxy/values_runtime_proxy.h" #include "codegen/type/boolean_type.h" #include "codegen/type/integer_type.h" @@ -52,7 +53,7 @@ struct CompareVarbinary : public TypeSystem::ExpensiveComparisonHandleNull { // Setup the function arguments and invoke the call std::vector args = {left.GetValue(), left.GetLength(), right.GetValue(), right.GetLength()}; - return codegen.Call(ValuesRuntimeProxy::CompareStrings, args); + return codegen.Call(StringFunctionsProxy::CompareStrings, args); } Value CompareLtImpl(CodeGen &codegen, const Value &left, diff --git a/src/codegen/type/varchar_type.cpp b/src/codegen/type/varchar_type.cpp index 001b6afaca9..dc3ab961f3d 100644 --- a/src/codegen/type/varchar_type.cpp +++ b/src/codegen/type/varchar_type.cpp @@ -52,7 +52,7 @@ struct CompareVarchar : public TypeSystem::ExpensiveComparisonHandleNull { // Setup the function arguments and invoke the call std::vector args = {left.GetValue(), left.GetLength(), right.GetValue(), right.GetLength()}; - return codegen.Call(ValuesRuntimeProxy::CompareStrings, args); + return codegen.Call(StringFunctionsProxy::CompareStrings, args); } Value CompareLtImpl(CodeGen &codegen, const Value &left, diff --git a/src/codegen/values_runtime.cpp b/src/codegen/values_runtime.cpp index e23e552813a..a3c41196762 100644 --- a/src/codegen/values_runtime.cpp +++ b/src/codegen/values_runtime.cpp @@ -12,8 +12,6 @@ #include "codegen/values_runtime.h" -#include - #include "codegen/runtime_functions.h" #include "codegen/type/type.h" #include "type/abstract_pool.h" @@ -101,267 +99,5 @@ void ValuesRuntime::OutputVarbinary(char *values, uint32_t idx, const char *ptr, peloton::type::ValueFactory::GetVarbinaryValue(bin_ptr, len, false)); } -//////////////////////////////////////////////////////////////////////////////// -/// -/// Input functions -/// -//////////////////////////////////////////////////////////////////////////////// - -namespace { - -/** - * Skip all leading and trailing whitespace from the string bounded by the - * provided pointers. This function will modify the input pointers to point to - * the first non-space character at the start and end of the input string. - * - * @param[in,out] left A pointer to the leftmost character in the input string - * @param[in,out] right A pointer to the rightmost character in the input string - */ -void TrimLeftRight(const char *&left, const char *&right) { - while (*left == ' ') { - left++; - } - while (right > left && *(right - 1) == ' ') { - right--; - } -} - -/** - * Convert the provided input string into a integral number. This function - * handles leading whitespace and leading negative (-) or positive (+) signs. - * Additionally, it performs a bounds check to ensure the number falls into the - * valid range of numbers for the given type. - * - * @tparam T The integral type (int8_t, int16_t, int32_t, int64_t) - * @param ptr A pointer to the start of the input string - * @param len The length of the input string - * @return The numeric interpretation of the input string - */ -template -typename std::enable_if::value, T>::type ToNum( - const char *ptr, uint32_t len) { - if (len == 0) { - RuntimeFunctions::ThrowInvalidInputStringException(); - __builtin_unreachable(); - } - - const char *start = ptr; - const char *end = start + len; - - // Trim leading and trailing whitespace - TrimLeftRight(start, end); - - // Check negative or positive sign - bool negative = false; - if (*start == '-') { - negative = true; - start++; - } else if (*start == '+') { - start++; - } - - // Convert - int64_t num = 0; - while (start != end) { - if (*start < '0' || *start > '9') { - RuntimeFunctions::ThrowInvalidInputStringException(); - __builtin_unreachable(); - } - - num = (num * 10) + (*start - '0'); - - start++; - } - - // Negate number if we need to - if (negative) { - num = -num; - } - - // Range check - if (num <= std::numeric_limits::min() || - num >= std::numeric_limits::max()) { - RuntimeFunctions::ThrowOverflowException(); - __builtin_unreachable(); - } - - // Done - return static_cast(num); -} - -template -typename std::enable_if::value, T>::type ToNum( - const char *ptr, uint32_t len) { - if (len == 0) { - RuntimeFunctions::ThrowInvalidInputStringException(); - __builtin_unreachable(); - } - - // TODO(pmenon): Optimize me later - char *end = nullptr; - auto ret = std::strtod(ptr, &end); - - if (unlikely_branch(end == ptr)) { - if (errno == ERANGE) { - RuntimeFunctions::ThrowOverflowException(); - __builtin_unreachable(); - } else { - RuntimeFunctions::ThrowInvalidInputStringException(); - __builtin_unreachable(); - } - } - - // Done - return static_cast(ret); -} - -} // namespace - -bool ValuesRuntime::InputBoolean(UNUSED_ATTRIBUTE const type::Type &type, - const char *ptr, uint32_t len) { - PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); - - if (len == 0) { - RuntimeFunctions::ThrowInvalidInputStringException(); - __builtin_unreachable(); - } - - const char *start = ptr, *end = ptr + len; - - // Trim leading and trailing whitespace - TrimLeftRight(start, end); - - // - uint64_t trimmed_len = end - start; - - // Check cases - switch (*start) { - case 't': - case 'T': { - static constexpr char kTrue[] = "true"; - std::cout << sizeof(kTrue) << std::endl; - if (strncasecmp(start, kTrue, trimmed_len) == 0) { - return true; - } - break; - } - case 'f': - case 'F': { - static constexpr char kFalse[] = "false"; - if (strncasecmp(start, kFalse, trimmed_len) == 0) { - return false; - } - break; - } - case 'y': - case 'Y': { - static constexpr char kYes[] = "yes"; - if (strncasecmp(start, kYes, trimmed_len) == 0) { - return true; - } - break; - } - case 'n': - case 'N': { - static constexpr char kNo[] = "no"; - if (strncasecmp(start, kNo, trimmed_len) == 0) { - return false; - } - break; - } - case 'o': - case 'O': { - // 'o' not enough to distinguish between on/off - static constexpr char kOff[] = "off"; - static constexpr char kOn[] = "on"; - if (strncasecmp(start, kOff, (trimmed_len > 3 ? trimmed_len : 3)) == 0) { - return false; - } else if (strncasecmp(start, kOn, (trimmed_len > 2 ? trimmed_len : 2)) == - 0) { - return true; - } - break; - } - case '0': { - if (trimmed_len == 1) { - return false; - } else { - return true; - } - } - case '1': { - if (trimmed_len == 1) { - return true; - } else { - return false; - } - } - default: { break; } - } - - // Error - RuntimeFunctions::ThrowInvalidInputStringException(); - __builtin_unreachable(); -} - -int8_t ValuesRuntime::InputTinyInt(UNUSED_ATTRIBUTE const type::Type &type, - const char *ptr, uint32_t len) { - PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); - return ToNum(ptr, len); -} - -int16_t ValuesRuntime::InputSmallInt(UNUSED_ATTRIBUTE const type::Type &type, - const char *ptr, uint32_t len) { - PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); - return ToNum(ptr, len); -} - -int32_t ValuesRuntime::InputInteger(UNUSED_ATTRIBUTE const type::Type &type, - const char *ptr, uint32_t len) { - PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); - return ToNum(ptr, len); -} - -int64_t ValuesRuntime::InputBigInt(UNUSED_ATTRIBUTE const type::Type &type, - const char *ptr, uint32_t len) { - PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); - return ToNum(ptr, len); -} - -double ValuesRuntime::InputDecimal(UNUSED_ATTRIBUTE const type::Type &type, - const char *ptr, uint32_t len) { - PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); - return ToNum(ptr, len); -} - -//////////////////////////////////////////////////////////////////////////////// -/// -/// String comparison -/// -//////////////////////////////////////////////////////////////////////////////// - -int32_t ValuesRuntime::CompareStrings(const char *str1, uint32_t len1, - const char *str2, uint32_t len2) { - return peloton::type::TypeUtil::CompareStrings(str1, len1, str2, len2); -} - -void ValuesRuntime::WriteVarlen(const char *data, uint32_t len, char *buf, - peloton::type::AbstractPool &pool) { - struct Varlen { - uint32_t len; - char data[0]; - }; - - // Allocate memory for the Varlen object - auto *area = static_cast(pool.Allocate(sizeof(uint32_t) + len)); - - // Populate it - area->len = len; - PELOTON_MEMCPY(area->data, data, len); - - // Store a pointer to the Varlen object into the target memory space - *reinterpret_cast(buf) = area; -} - } // namespace codegen } // namespace peloton diff --git a/src/function/date_functions.cpp b/src/function/date_functions.cpp index 233259844c8..9c676690a13 100644 --- a/src/function/date_functions.cpp +++ b/src/function/date_functions.cpp @@ -18,7 +18,6 @@ #include #include -#include "common/logger.h" #include "common/internal_types.h" #include "type/value.h" #include "type/value_factory.h" @@ -26,30 +25,28 @@ namespace peloton { namespace function { -// This now is not what postgres does. -// Postgres is returning the time when the transaction begins -// We are here intead generating a new time when this function -// is called +// This implementation of Now() is **not** what postgres does. Postgres is +// returning the time when the transaction begins. We are here instead +// generating a new time when this function is called. int64_t DateFunctions::Now() { uint64_t time_stamp; struct timeval tv; struct tm *time_info; - uint64_t hour_min_sec_base = 1000000; //us to sec + uint64_t hour_min_sec_base = 1000000; // us to sec uint64_t year_base = hour_min_sec_base * 100000; - uint64_t day_base = year_base * 10000 * 27; // skip the time zone + uint64_t day_base = year_base * 10000 * 27; // skip the time zone uint64_t month_base = day_base * 32; gettimeofday(&tv, NULL); time_info = gmtime(&(tv.tv_sec)); - uint32_t hour_min_sec = time_info->tm_hour * 3600 + - time_info->tm_min * 60 + - time_info->tm_sec; + uint32_t hour_min_sec = + time_info->tm_hour * 3600 + time_info->tm_min * 60 + time_info->tm_sec; // EPOCH time start from 1970 uint16_t year = time_info->tm_year + 1900; uint16_t day = time_info->tm_mday; - uint16_t month = time_info->tm_mon + 1; // tm_mon is from 0 - 11 + uint16_t month = time_info->tm_mon + 1; // tm_mon is from 0 - 11 time_stamp = tv.tv_usec; time_stamp += hour_min_sec_base * hour_min_sec; @@ -60,10 +57,16 @@ int64_t DateFunctions::Now() { return time_stamp; } -type::Value DateFunctions::_Now(const UNUSED_ATTRIBUTE std::vector &args) { - PELOTON_ASSERT(args.size() == 0); - int64_t now = Now(); - return type::ValueFactory::GetTimestampValue(now); +type::Value DateFunctions::_Now( + UNUSED_ATTRIBUTE const std::vector &args) { + PELOTON_ASSERT(args.empty()); + return type::ValueFactory::GetTimestampValue(Now()); +} + +int32_t DateFunctions::InputDate( + UNUSED_ATTRIBUTE const codegen::type::Type &type, + UNUSED_ATTRIBUTE const char *data, UNUSED_ATTRIBUTE uint32_t len) { + return 0; } } // namespace expression diff --git a/src/function/decimal_functions.cpp b/src/function/decimal_functions.cpp deleted file mode 100644 index b722993b4d0..00000000000 --- a/src/function/decimal_functions.cpp +++ /dev/null @@ -1,152 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Peloton -// -// decimal_functions.cpp -// -// Identification: src/function/decimal_functions.cpp -// -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -#include "function/decimal_functions.h" -#include "type/value_factory.h" - -namespace peloton { -namespace function { - -// Get square root of the value -type::Value DecimalFunctions::Sqrt(const std::vector &args) { - PELOTON_ASSERT(args.size() == 1); - if (args[0].IsNull()) { - return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); - } - return args[0].Sqrt(); -} - -// Get Abs of value -type::Value DecimalFunctions::_Abs(const std::vector &args) { - PELOTON_ASSERT(args.size() == 1); - if (args[0].IsNull()) { - return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); - } - switch (args[0].GetElementType()) { - case type::TypeId::DECIMAL: - { - double result; - result = Abs(args[0].GetAs()); - return type::ValueFactory::GetDecimalValue(result); - } - break; - case type::TypeId::INTEGER: - { - int32_t result; - result = abs(args[0].GetAs()); - return type::ValueFactory::GetIntegerValue(result); - break; - } - case type::TypeId::BIGINT: - { - int64_t result; - result = std::abs(args[0].GetAs()); - return type::ValueFactory::GetBigIntValue(result); - } - break; - case type::TypeId::SMALLINT: - { - int16_t result; - result = abs(args[0].GetAs()); - return type::ValueFactory::GetSmallIntValue(result); - } - break; - case type::TypeId::TINYINT: - { - int8_t result; - result = abs(args[0].GetAs()); - return type::ValueFactory::GetTinyIntValue(result); - } - break; - default: - return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); - } -} - -double DecimalFunctions::Abs(const double args) { return fabs(args); } - -// Get ceiling of value -type::Value DecimalFunctions::_Ceil(const std::vector &args) { - PELOTON_ASSERT(args.size() == 1); - if (args[0].IsNull()) { - return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); - } - double result; - switch (args[0].GetElementType()) { - case type::TypeId::DECIMAL: - result = Ceil(args[0].GetAs()); - break; - case type::TypeId::INTEGER: - result = args[0].GetAs(); - break; - case type::TypeId::BIGINT: - result = args[0].GetAs(); - break; - case type::TypeId::SMALLINT: - result = args[0].GetAs(); - break; - case type::TypeId::TINYINT: - result = args[0].GetAs(); - break; - default: - return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); - } - return type::ValueFactory::GetDecimalValue(result); -} - -double DecimalFunctions::Ceil(const double args) { return ceil(args); } - -// Get floor value -type::Value DecimalFunctions::_Floor(const std::vector &args) { - PELOTON_ASSERT(args.size() == 1); - if (args[0].IsNull()) { - return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); - } - double res; - switch(args[0].GetElementType()) { - case type::TypeId::DECIMAL: - res = Floor(args[0].GetAs()); - break; - case type::TypeId::INTEGER: - res = args[0].GetAs(); - break; - case type::TypeId::BIGINT: - res = args[0].GetAs(); - break; - case type::TypeId::SMALLINT: - res = args[0].GetAs(); - break; - case type::TypeId::TINYINT: - res = args[0].GetAs(); - break; - default: - return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); - } - return type::ValueFactory::GetDecimalValue(res); -} - -double DecimalFunctions::Floor(const double val) { return floor(val); } - -// Round to nearest integer -type::Value DecimalFunctions::_Round(const std::vector &args) { - PELOTON_ASSERT(args.size() == 1); - if (args[0].IsNull()) { - return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); - } - return type::ValueFactory::GetDecimalValue(Round(args[0].GetAs())); -} - -double DecimalFunctions::Round(double arg) { return round(arg); } - - -} // namespace function -} // namespace peloton diff --git a/src/function/numeric_functions.cpp b/src/function/numeric_functions.cpp new file mode 100644 index 00000000000..50a00ee516a --- /dev/null +++ b/src/function/numeric_functions.cpp @@ -0,0 +1,417 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// numeric_functions.cpp +// +// Identification: src/function/numeric_functions.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "function/numeric_functions.h" + +#include "codegen/type/type.h" +#include "codegen/runtime_functions.h" +#include "type/value.h" +#include "type/value_factory.h" + +namespace peloton { +namespace function { + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Square root +/// +//////////////////////////////////////////////////////////////////////////////// + +double NumericFunctions::ISqrt(uint32_t num) { + return std::sqrt(num); +} + +double NumericFunctions::DSqrt(double num) { return std::sqrt(num); } + +type::Value NumericFunctions::Sqrt(const std::vector &args) { + PELOTON_ASSERT(args.size() == 1); + if (args[0].IsNull()) { + return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); + } + return args[0].Sqrt(); +} + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Absolute value +/// +//////////////////////////////////////////////////////////////////////////////// + +double NumericFunctions::Abs(const double args) { return fabs(args); } + +// Get Abs of value +type::Value NumericFunctions::_Abs(const std::vector &args) { + PELOTON_ASSERT(args.size() == 1); + if (args[0].IsNull()) { + return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); + } + switch (args[0].GetElementType()) { + case type::TypeId::DECIMAL: { + double result; + result = Abs(args[0].GetAs()); + return type::ValueFactory::GetDecimalValue(result); + } + case type::TypeId::INTEGER: { + int32_t result; + result = abs(args[0].GetAs()); + return type::ValueFactory::GetIntegerValue(result); + } + case type::TypeId::BIGINT: { + int64_t result; + result = std::abs(args[0].GetAs()); + return type::ValueFactory::GetBigIntValue(result); + } + case type::TypeId::SMALLINT: { + int16_t result; + result = abs(args[0].GetAs()); + return type::ValueFactory::GetSmallIntValue(result); + } + case type::TypeId::TINYINT: { + int8_t result; + result = abs(args[0].GetAs()); + return type::ValueFactory::GetTinyIntValue(result); + } + default: { + return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Ceiling value +/// +//////////////////////////////////////////////////////////////////////////////// + +double NumericFunctions::Ceil(const double args) { return ceil(args); } + +type::Value NumericFunctions::_Ceil(const std::vector &args) { + PELOTON_ASSERT(args.size() == 1); + if (args[0].IsNull()) { + return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); + } + double result; + switch (args[0].GetElementType()) { + case type::TypeId::DECIMAL: + result = Ceil(args[0].GetAs()); + break; + case type::TypeId::INTEGER: + result = args[0].GetAs(); + break; + case type::TypeId::BIGINT: + result = args[0].GetAs(); + break; + case type::TypeId::SMALLINT: + result = args[0].GetAs(); + break; + case type::TypeId::TINYINT: + result = args[0].GetAs(); + break; + default: + return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); + } + return type::ValueFactory::GetDecimalValue(result); +} + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Floor value +/// +//////////////////////////////////////////////////////////////////////////////// + +double NumericFunctions::Floor(const double val) { return floor(val); } + +type::Value NumericFunctions::_Floor(const std::vector &args) { + PELOTON_ASSERT(args.size() == 1); + if (args[0].IsNull()) { + return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); + } + double res; + switch (args[0].GetElementType()) { + case type::TypeId::DECIMAL: + res = Floor(args[0].GetAs()); + break; + case type::TypeId::INTEGER: + res = args[0].GetAs(); + break; + case type::TypeId::BIGINT: + res = args[0].GetAs(); + break; + case type::TypeId::SMALLINT: + res = args[0].GetAs(); + break; + case type::TypeId::TINYINT: + res = args[0].GetAs(); + break; + default: + return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); + } + return type::ValueFactory::GetDecimalValue(res); +} + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Rounding +/// +//////////////////////////////////////////////////////////////////////////////// + +double NumericFunctions::Round(double arg) { return round(arg); } + +type::Value NumericFunctions::_Round(const std::vector &args) { + PELOTON_ASSERT(args.size() == 1); + if (args[0].IsNull()) { + return type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); + } + return type::ValueFactory::GetDecimalValue(Round(args[0].GetAs())); +} + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Input functions +/// +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +/** + * Skip all leading and trailing whitespace from the string bounded by the + * provided pointers. This function will modify the input pointers to point to + * the first non-whitespace space character at the start and end of the input + * string. + * + * @param[in,out] left Pointer to the left-most character in the input string + * @param[in,out] right Pointer to the right-most character in the input string + */ +void TrimLeftRight(const char *&left, const char *&right) { + while (*left == ' ') { + left++; + } + while (right > left && *(right - 1) == ' ') { + right--; + } +} + +/** + * Convert the provided input string into an integral number. This function + * handles leading whitespace and leading negative (-) or positive (+) signs. + * Additionally, it performs a bounds check to ensure the number falls into the + * valid range of numbers for the given type. + * + * @tparam T The integral type (int8_t, int16_t, int32_t, int64_t) + * @param ptr A pointer to the start of the input string + * @param len The length of the input string + * @return The numeric interpretation of the input string + */ +template +T ParseInteger(const char *ptr, uint32_t len) { + static_assert(std::is_integral::value, + "Must provide integer-type when calling ParseInteger"); + + if (len == 0) { + codegen::RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); + } + + const char *start = ptr; + const char *end = start + len; + + // Trim leading and trailing whitespace + TrimLeftRight(start, end); + + // Check negative or positive sign + bool negative = false; + if (*start == '-') { + negative = true; + start++; + } else if (*start == '+') { + start++; + } + + // Convert + int64_t num = 0; + while (start != end) { + if (*start < '0' || *start > '9') { + codegen::RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); + } + + num = (num * 10) + (*start - '0'); + + start++; + } + + PELOTON_ASSERT(start == end); + + // Negate number if we need to + if (negative) { + num = -num; + } + + // Range check + if (num <= std::numeric_limits::min() || + num >= std::numeric_limits::max()) { + codegen::RuntimeFunctions::ThrowOverflowException(); + __builtin_unreachable(); + } + + // Done + return static_cast(num); +} + +} // namespace + +bool NumericFunctions::InputBoolean( + UNUSED_ATTRIBUTE const codegen::type::Type &type, const char *ptr, + uint32_t len) { + PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); + + if (len == 0) { + codegen::RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); + } + + const char *start = ptr, *end = ptr + len; + + // Trim leading and trailing whitespace + TrimLeftRight(start, end); + + // + uint64_t trimmed_len = end - start; + + // Check cases + switch (*start) { + case 't': + case 'T': { + static constexpr char kTrue[] = "true"; + if (strncasecmp(start, kTrue, trimmed_len) == 0) { + return true; + } + break; + } + case 'f': + case 'F': { + static constexpr char kFalse[] = "false"; + if (strncasecmp(start, kFalse, trimmed_len) == 0) { + return false; + } + break; + } + case 'y': + case 'Y': { + static constexpr char kYes[] = "yes"; + if (strncasecmp(start, kYes, trimmed_len) == 0) { + return true; + } + break; + } + case 'n': + case 'N': { + static constexpr char kNo[] = "no"; + if (strncasecmp(start, kNo, trimmed_len) == 0) { + return false; + } + break; + } + case 'o': + case 'O': { + // 'o' not enough to distinguish between on/off + static constexpr char kOff[] = "off"; + static constexpr char kOn[] = "on"; + if (strncasecmp(start, kOff, (trimmed_len > 3 ? trimmed_len : 3)) == 0) { + return false; + } else if (strncasecmp(start, kOn, (trimmed_len > 2 ? trimmed_len : 2)) == + 0) { + return true; + } + break; + } + case '0': { + if (trimmed_len == 1) { + return false; + } else { + return true; + } + } + case '1': { + if (trimmed_len == 1) { + return true; + } else { + return false; + } + } + default: { break; } + } + + // Error + codegen::RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); +} + +int8_t NumericFunctions::InputTinyInt( + UNUSED_ATTRIBUTE const codegen::type::Type &type, const char *ptr, + uint32_t len) { + PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); + return ParseInteger(ptr, len); +} + +int16_t NumericFunctions::InputSmallInt( + UNUSED_ATTRIBUTE const codegen::type::Type &type, const char *ptr, + uint32_t len) { + PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); + return ParseInteger(ptr, len); +} + +int32_t NumericFunctions::InputInteger( + UNUSED_ATTRIBUTE const codegen::type::Type &type, const char *ptr, + uint32_t len) { + PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); + return ParseInteger(ptr, len); +} + +int64_t NumericFunctions::InputBigInt( + UNUSED_ATTRIBUTE const codegen::type::Type &type, const char *ptr, + uint32_t len) { + PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); + return ParseInteger(ptr, len); +} + +double NumericFunctions::InputDecimal( + UNUSED_ATTRIBUTE const codegen::type::Type &type, const char *ptr, + uint32_t len) { + PELOTON_ASSERT(ptr != nullptr && "Input is assumed to be non-NULL"); + if (len == 0) { + codegen::RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); + } + + // We don't trim because std::strtod() does the trimming for us + + // TODO(pmenon): Optimize me later + char *end = nullptr; + double ret = std::strtod(ptr, &end); + + if (unlikely_branch(end == ptr)) { + if (errno == ERANGE) { + codegen::RuntimeFunctions::ThrowOverflowException(); + __builtin_unreachable(); + } else { + codegen::RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); + } + } + + // Done + return ret; +} + +} // namespace function +} // namespace peloton diff --git a/src/function/string_functions.cpp b/src/function/string_functions.cpp index 841a9ee6e15..2cf8d9f0b89 100644 --- a/src/function/string_functions.cpp +++ b/src/function/string_functions.cpp @@ -14,6 +14,8 @@ #include "common/macros.h" #include "executor/executor_context.h" +#include "type/type_util.h" +#include "type/abstract_pool.h" namespace peloton { namespace function { @@ -220,5 +222,28 @@ uint32_t StringFunctions::Length( return length; } +int32_t StringFunctions::CompareStrings(const char *str1, uint32_t len1, + const char *str2, uint32_t len2) { + return peloton::type::TypeUtil::CompareStrings(str1, len1, str2, len2); +} + +void StringFunctions::WriteString(const char *data, uint32_t len, char *buf, + peloton::type::AbstractPool &pool) { + struct Varlen { + uint32_t len; + char data[0]; + }; + + // Allocate memory for the Varlen object + auto *area = static_cast(pool.Allocate(sizeof(uint32_t) + len)); + + // Populate it + area->len = len; + PELOTON_MEMCPY(area->data, data, len); + + // Store a pointer to the Varlen object into the target memory space + *reinterpret_cast(buf) = area; +} + } // namespace function } // namespace peloton diff --git a/src/include/codegen/proxy/date_functions_proxy.h b/src/include/codegen/proxy/date_functions_proxy.h index 38f96b3cd38..7954afe72d3 100644 --- a/src/include/codegen/proxy/date_functions_proxy.h +++ b/src/include/codegen/proxy/date_functions_proxy.h @@ -6,7 +6,7 @@ // // Identification: src/include/codegen/proxy/date_functions_proxy.h // -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -18,8 +18,11 @@ namespace peloton { namespace codegen { PROXY(DateFunctions) { - // Proxy everything in function::StringFunctions + // Utility functions DECLARE_METHOD(Now); + + // Input functions + DECLARE_METHOD(InputDate); }; } // namespace codegen diff --git a/src/include/codegen/proxy/decimal_functions_proxy.h b/src/include/codegen/proxy/numeric_functions_proxy.h similarity index 52% rename from src/include/codegen/proxy/decimal_functions_proxy.h rename to src/include/codegen/proxy/numeric_functions_proxy.h index 4d9b70a5671..b3a338e06a8 100644 --- a/src/include/codegen/proxy/decimal_functions_proxy.h +++ b/src/include/codegen/proxy/numeric_functions_proxy.h @@ -2,11 +2,11 @@ // // Peloton // -// decimal_functions_proxy.h +// numeric_functions_proxy.h // -// Identification: src/include/codegen/proxy/decimal_functions_proxy.h +// Identification: src/include/codegen/proxy/numeric_functions_proxy.h // -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -17,13 +17,20 @@ namespace peloton { namespace codegen { -PROXY(DecimalFunctions) { - // Proxy everything in function::DecimalFunctions - +PROXY(NumericFunctions) { + // Utility functions DECLARE_METHOD(Abs); DECLARE_METHOD(Floor); DECLARE_METHOD(Round); DECLARE_METHOD(Ceil); + + // Input functions + DECLARE_METHOD(InputBoolean); + DECLARE_METHOD(InputTinyInt); + DECLARE_METHOD(InputSmallInt); + DECLARE_METHOD(InputInteger); + DECLARE_METHOD(InputBigInt); + DECLARE_METHOD(InputDecimal); }; } // namespace codegen diff --git a/src/include/codegen/proxy/string_functions_proxy.h b/src/include/codegen/proxy/string_functions_proxy.h index e9cf1c9c7fc..bff1911d0a2 100644 --- a/src/include/codegen/proxy/string_functions_proxy.h +++ b/src/include/codegen/proxy/string_functions_proxy.h @@ -29,6 +29,8 @@ PROXY(StringFunctions) { DECLARE_METHOD(RTrim); DECLARE_METHOD(Substr); DECLARE_METHOD(Repeat); + DECLARE_METHOD(CompareStrings); + DECLARE_METHOD(WriteString); }; PROXY(StrWithLen) { diff --git a/src/include/codegen/proxy/values_runtime_proxy.h b/src/include/codegen/proxy/values_runtime_proxy.h index 059f700d8c6..9868d518bac 100644 --- a/src/include/codegen/proxy/values_runtime_proxy.h +++ b/src/include/codegen/proxy/values_runtime_proxy.h @@ -29,17 +29,6 @@ PROXY(ValuesRuntime) { DECLARE_METHOD(OutputDecimal); DECLARE_METHOD(OutputVarchar); DECLARE_METHOD(OutputVarbinary); - - DECLARE_METHOD(InputBoolean); - DECLARE_METHOD(InputTinyInt); - DECLARE_METHOD(InputSmallInt); - DECLARE_METHOD(InputInteger); - DECLARE_METHOD(InputBigInt); - DECLARE_METHOD(InputDecimal); - - DECLARE_METHOD(CompareStrings); - - DECLARE_METHOD(WriteVarlen); }; } // namespace codegen diff --git a/src/include/codegen/values_runtime.h b/src/include/codegen/values_runtime.h index 905ead1fd68..fd5c26b0e78 100644 --- a/src/include/codegen/values_runtime.h +++ b/src/include/codegen/values_runtime.h @@ -28,12 +28,6 @@ class Type; class ValuesRuntime { public: - ////////////////////////////////////////////////////////////////////////////// - /// - /// Output functions - /// - ////////////////////////////////////////////////////////////////////////////// - // Write out the given boolean value into the array at the provided index static void OutputBoolean(char *values, uint32_t idx, bool val, bool is_null); @@ -65,53 +59,6 @@ class ValuesRuntime { // Write out the given varbinary value into the array at the provided index static void OutputVarbinary(char *values, uint32_t idx, const char *str, uint32_t len); - - ////////////////////////////////////////////////////////////////////////////// - /// - /// Input functions - //// - ////////////////////////////////////////////////////////////////////////////// - - static bool InputBoolean(const type::Type &type, const char *ptr, - uint32_t len); - - static int8_t InputTinyInt(const type::Type &type, const char *ptr, - uint32_t len); - - static int16_t InputSmallInt(const type::Type &type, const char *ptr, - uint32_t len); - - static int32_t InputInteger(const type::Type &type, const char *ptr, - uint32_t len); - - static int64_t InputBigInt(const type::Type &type, const char *ptr, - uint32_t len); - - static double InputDecimal(const type::Type &type, const char *ptr, - uint32_t len); - - /** - * Compare two strings, returning an integer value indicating their sort order - * - * @param str1 A pointer to the first string - * @param len1 The length of the first string - * @param str2 A pointer to the second string - * @param len2 The length of the second string - * @return - */ - static int32_t CompareStrings(const char *str1, uint32_t len1, - const char *str2, uint32_t len2); - - /** - * Write the provided variable length object into the target buffer. - * - * @param data The bytes we wish to serialize - * @param len The length of the byte array - * @param buf The target position we wish to write to - * @param pool A memory pool to source memory from - */ - static void WriteVarlen(const char *data, uint32_t len, char *buf, - peloton::type::AbstractPool &pool); }; } // namespace codegen diff --git a/src/include/common/container_tuple.h b/src/include/common/container_tuple.h index 29613067734..0d27a0da6f5 100644 --- a/src/include/common/container_tuple.h +++ b/src/include/common/container_tuple.h @@ -6,7 +6,7 @@ // // Identification: src/include/common/container_tuple.h // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -17,6 +17,7 @@ #include #include +#include "catalog/schema.h" #include "common/abstract_tuple.h" #include "common/exception.h" #include "common/macros.h" diff --git a/src/include/function/date_functions.h b/src/include/function/date_functions.h index e5a6ca85cd6..fc5973ac95b 100644 --- a/src/include/function/date_functions.h +++ b/src/include/function/date_functions.h @@ -20,12 +20,34 @@ #include "type/value.h" namespace peloton { + +namespace codegen { +namespace type { +class Type; +} // namespace type +} // namespace codegen + namespace function { class DateFunctions { public: + /** + * Function used to return the current date/time. Normally called at the start + * of a transaction, and consistent throughout its duration. + * + * @return The current date at the time of invocation + */ static int64_t Now(); static type::Value _Now(const std::vector &args); + + /** + * + * @param data + * @param len + * @return + */ + static int32_t InputDate(const codegen::type::Type &type, const char *data, + uint32_t len); }; } // namespace function diff --git a/src/include/function/decimal_functions.h b/src/include/function/decimal_functions.h deleted file mode 100644 index f4373aa5750..00000000000 --- a/src/include/function/decimal_functions.h +++ /dev/null @@ -1,46 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Peloton -// -// decimal_functions.h -// -// Identification: src/include/function/decimal_functions.h -// -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include - -#include "type/value.h" - -namespace peloton { -namespace function { - -class DecimalFunctions { - public: - - // Abs - static double Abs(double arg); - static type::Value _Abs(const std::vector& args); - - // Sqrt - static type::Value Sqrt(const std::vector& args); - - // Floor - static double Floor(const double val); - static type::Value _Floor(const std::vector& args); - - // Round - static double Round(double arg); - static type::Value _Round(const std::vector& args); - - // Ceil - static double Ceil(const double args); - static type::Value _Ceil(const std::vector& args); -}; - -} // namespace function -} // namespace peloton diff --git a/src/include/function/numeric_functions.h b/src/include/function/numeric_functions.h new file mode 100644 index 00000000000..6a606caf5d5 --- /dev/null +++ b/src/include/function/numeric_functions.h @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// numeric_functions.h +// +// Identification: src/include/function/numeric_functions.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace peloton { + +namespace codegen { +namespace type { +class Type; +} // namespace type +} // namespace codegen + +namespace type { +class Value; +} // namespace value + +namespace function { + +class NumericFunctions { + public: + // Abs + static double Abs(double arg); + static type::Value _Abs(const std::vector &args); + + // Sqrt + static double ISqrt(uint32_t num); + static double DSqrt(double num); + static type::Value Sqrt(const std::vector &args); + + // Floor + static double Floor(double val); + static type::Value _Floor(const std::vector &args); + + // Round + static double Round(double arg); + static type::Value _Round(const std::vector &args); + + // Ceil + static double Ceil(double args); + static type::Value _Ceil(const std::vector &args); + + ////////////////////////////////////////////////////////////////////////////// + /// + /// Input functions + /// + ////////////////////////////////////////////////////////////////////////////// + + static bool InputBoolean(const codegen::type::Type &type, const char *ptr, + uint32_t len); + + static int8_t InputTinyInt(const codegen::type::Type &type, const char *ptr, + uint32_t len); + + static int16_t InputSmallInt(const codegen::type::Type &type, const char *ptr, + uint32_t len); + + static int32_t InputInteger(const codegen::type::Type &type, const char *ptr, + uint32_t len); + + static int64_t InputBigInt(const codegen::type::Type &type, const char *ptr, + uint32_t len); + + static double InputDecimal(const codegen::type::Type &type, const char *ptr, + uint32_t len); +}; + +} // namespace function +} // namespace peloton diff --git a/src/include/function/string_functions.h b/src/include/function/string_functions.h index 2a209d0dee6..db79dc2409e 100644 --- a/src/include/function/string_functions.h +++ b/src/include/function/string_functions.h @@ -20,6 +20,10 @@ namespace executor { class ExecutorContext; } // namespace executor +namespace type { +class AbstractPool; +} // namespace type; + namespace function { class StringFunctions { @@ -74,6 +78,32 @@ class StringFunctions { // Length will return the number of characters in the given string static uint32_t Length(executor::ExecutorContext &ctx, const char *str, uint32_t length); + + /** + * Compare two (potentially empty) strings returning an integer value + * indicating their sort order. + * + * @param str1 A pointer to the first string + * @param len1 The length of the first string + * @param str2 A pointer to the second string + * @param len2 The length of the second string + * @return -1 if the first string is strictly less than the second; 0 if the + * two strings are equal; 1 if the second string is strictly greater than the + * second. + */ + static int32_t CompareStrings(const char *str1, uint32_t len1, + const char *str2, uint32_t len2); + + /** + * Write the provided variable length object into the target buffer. + * + * @param data The bytes we wish to serialize + * @param len The length of the byte array + * @param buf The target position we wish to write to + * @param pool A memory pool to source memory from + */ + static void WriteString(const char *data, uint32_t len, char *buf, + peloton::type::AbstractPool &pool); }; } // namespace function diff --git a/test/codegen/value_integrity_test.cpp b/test/codegen/value_integrity_test.cpp index 9c78ece4787..97429771708 100644 --- a/test/codegen/value_integrity_test.cpp +++ b/test/codegen/value_integrity_test.cpp @@ -17,7 +17,7 @@ #include "codegen/type/smallint_type.h" #include "codegen/type/integer_type.h" #include "codegen/type/bigint_type.h" -#include "codegen/values_runtime.h" +#include "function/numeric_functions.h" namespace peloton { namespace test { @@ -224,17 +224,18 @@ void TestInputIntegral( TEST_F(ValueIntegrityTest, InputIntegralTypesTest) { codegen::type::Type tinyint{type::TypeId::TINYINT, false}; - TestInputIntegral(tinyint, codegen::ValuesRuntime::InputTinyInt, + TestInputIntegral(tinyint, function::NumericFunctions::InputTinyInt, {{"-126", -126}, {"126", 126}}); codegen::type::Type smallint{type::TypeId::SMALLINT, false}; - TestInputIntegral(smallint, codegen::ValuesRuntime::InputSmallInt); + TestInputIntegral(smallint, + function::NumericFunctions::InputSmallInt); codegen::type::Type integer{type::TypeId::INTEGER, false}; - TestInputIntegral(integer, codegen::ValuesRuntime::InputInteger); + TestInputIntegral(integer, function::NumericFunctions::InputInteger); codegen::type::Type bigint{type::TypeId::BIGINT, false}; - TestInputIntegral(bigint, codegen::ValuesRuntime::InputBigInt); + TestInputIntegral(bigint, function::NumericFunctions::InputBigInt); } } // namespace test diff --git a/test/function/decimal_functions_test.cpp b/test/function/decimal_functions_test.cpp index 994523b732f..1ef4f7cd87c 100644 --- a/test/function/decimal_functions_test.cpp +++ b/test/function/decimal_functions_test.cpp @@ -17,7 +17,7 @@ #include "common/harness.h" -#include "function/decimal_functions.h" +#include "function/numeric_functions.h" #include "common/internal_types.h" #include "type/value.h" #include "type/value_factory.h" @@ -37,13 +37,13 @@ TEST_F(DecimalFunctionsTests, SqrtTest) { std::vector args = { type::ValueFactory::GetDecimalValue(column_val)}; - auto result = function::DecimalFunctions::Sqrt(args); + auto result = function::NumericFunctions::Sqrt(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(expected, result.GetAs()); // NULL CHECK args = {type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL)}; - result = function::DecimalFunctions::Sqrt(args); + result = function::NumericFunctions::Sqrt(args); EXPECT_TRUE(result.IsNull()); } @@ -53,7 +53,7 @@ TEST_F(DecimalFunctionsTests, FloorTest) { std::vector args; for (double in : inputs) { args = {type::ValueFactory::GetDecimalValue(in)}; - auto result = function::DecimalFunctions::_Floor(args); + auto result = function::NumericFunctions::_Floor(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(floor(in), result.GetAs()); } @@ -61,31 +61,31 @@ TEST_F(DecimalFunctionsTests, FloorTest) { // Testing Floor with Integer Types(Should be a no-op) int64_t numInt64 = 1; args = {type::ValueFactory::GetIntegerValue(numInt64)}; - auto result = function::DecimalFunctions::_Floor(args); + auto result = function::NumericFunctions::_Floor(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(numInt64, result.GetAs()); int32_t numInt32 = 1; args = {type::ValueFactory::GetIntegerValue(numInt32)}; - result = function::DecimalFunctions::_Floor(args); + result = function::NumericFunctions::_Floor(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(numInt32, result.GetAs()); int16_t numInt16 = 1; args = {type::ValueFactory::GetIntegerValue(numInt32)}; - result = function::DecimalFunctions::_Floor(args); + result = function::NumericFunctions::_Floor(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(numInt16, result.GetAs()); int16_t numInt8 = 1; args = {type::ValueFactory::GetIntegerValue(numInt8)}; - result = function::DecimalFunctions::_Floor(args); + result = function::NumericFunctions::_Floor(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(numInt8, result.GetAs()); // NULL CHECK args = {type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL)}; - result = function::DecimalFunctions::_Floor(args); + result = function::NumericFunctions::_Floor(args); EXPECT_TRUE(result.IsNull()); } @@ -94,14 +94,14 @@ TEST_F(DecimalFunctionsTests, RoundTest) { std::vector args; for (double val : column_vals) { args = {type::ValueFactory::GetDecimalValue(val)}; - auto result = function::DecimalFunctions::_Round(args); + auto result = function::NumericFunctions::_Round(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(round(val), result.GetAs()); } // NULL CHECK args = {type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL)}; - auto result = function::DecimalFunctions::_Round(args); + auto result = function::NumericFunctions::_Round(args); EXPECT_TRUE(result.IsNull()); } @@ -110,14 +110,14 @@ TEST_F(DecimalFunctionsTests,AbsTestDouble) { std::vector args; for (double in : doubleTestInputs) { args = {type::ValueFactory::GetDecimalValue(in)}; - auto result = function::DecimalFunctions::_Abs(args); + auto result = function::NumericFunctions::_Abs(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(fabs(in), result.GetAs()); } // NULL CHECK args = {type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL)}; - auto result = function::DecimalFunctions::_Abs(args); + auto result = function::NumericFunctions::_Abs(args); EXPECT_TRUE(result.IsNull()); } @@ -131,28 +131,28 @@ TEST_F(DecimalFunctionsTests, AbsTestInt) { // Testing Abs with Integer Types for (int64_t in: bigIntTestInputs) { args = {type::ValueFactory::GetBigIntValue(in)}; - auto result = function::DecimalFunctions::_Abs(args); + auto result = function::NumericFunctions::_Abs(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(std::abs(in), result.GetAs()); } for (int32_t in: intTestInputs) { args = {type::ValueFactory::GetIntegerValue(in)}; - auto result = function::DecimalFunctions::_Abs(args); + auto result = function::NumericFunctions::_Abs(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(abs(in), result.GetAs()); } for (int16_t in: smallIntTestInputs) { args = {type::ValueFactory::GetSmallIntValue(in)}; - auto result = function::DecimalFunctions::_Abs(args); + auto result = function::NumericFunctions::_Abs(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(abs(in), result.GetAs()); } for (int8_t in: tinyIntTestInputs) { args = {type::ValueFactory::GetTinyIntValue(in)}; - auto result = function::DecimalFunctions::_Abs(args); + auto result = function::NumericFunctions::_Abs(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(abs(in), result.GetAs()); } @@ -165,13 +165,13 @@ TEST_F(DecimalFunctionsTests, CeilTestDouble) { std::vector args; for (double in: doubleTestInputs) { args = {type::ValueFactory::GetDecimalValue(in)}; - auto result = function::DecimalFunctions::_Ceil(args); + auto result = function::NumericFunctions::_Ceil(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(ceil(in), result.GetAs()); } args = {type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL)}; - auto result = function::DecimalFunctions::_Ceil(args); + auto result = function::NumericFunctions::_Ceil(args); EXPECT_TRUE(result.IsNull()); } @@ -185,28 +185,28 @@ TEST_F(DecimalFunctionsTests, CeilTestInt) { // Testing Ceil with Integer Types for (int64_t in: bigIntTestInputs) { args = {type::ValueFactory::GetIntegerValue(in)}; - auto result = function::DecimalFunctions::_Ceil(args); + auto result = function::NumericFunctions::_Ceil(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(ceil(in), result.GetAs()); } for (int in: intTestInputs) { args = {type::ValueFactory::GetIntegerValue(in)}; - auto result = function::DecimalFunctions::_Ceil(args); + auto result = function::NumericFunctions::_Ceil(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(ceil(in), result.GetAs()); } for (int in: smallIntTestInputs) { args = {type::ValueFactory::GetIntegerValue(in)}; - auto result = function::DecimalFunctions::_Ceil(args); + auto result = function::NumericFunctions::_Ceil(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(ceil(in), result.GetAs()); } for (int in: tinyIntTestInputs) { args = {type::ValueFactory::GetIntegerValue(in)}; - auto result = function::DecimalFunctions::_Ceil(args); + auto result = function::NumericFunctions::_Ceil(args); EXPECT_FALSE(result.IsNull()); EXPECT_EQ(ceil(in), result.GetAs()); } From e76ea692c6b271bd7aaa23fe595928d39571531a Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Wed, 16 May 2018 00:37:08 -0400 Subject: [PATCH 25/42] REALLY simple Date support --- src/function/date_functions.cpp | 157 +++++++++++++++++++++++++- src/include/function/date_functions.h | 130 ++++++++++++--------- src/type/date_type.cpp | 89 ++++++--------- 3 files changed, 264 insertions(+), 112 deletions(-) diff --git a/src/function/date_functions.cpp b/src/function/date_functions.cpp index 9c676690a13..ad4681b7714 100644 --- a/src/function/date_functions.cpp +++ b/src/function/date_functions.cpp @@ -6,7 +6,7 @@ // // Identification: src/function/date_functions.cpp // -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -14,10 +14,9 @@ #include #include -#include -#include #include +#include "codegen/runtime_functions.h" #include "common/internal_types.h" #include "type/value.h" #include "type/value_factory.h" @@ -63,10 +62,156 @@ type::Value DateFunctions::_Now( return type::ValueFactory::GetTimestampValue(Now()); } +int32_t DateFunctions::DateToJulian(int32_t year, int32_t month, int32_t day) { + // From Postgres date2j() + + if (month > 2) { + month += 1; + year += 4800; + } else { + month += 13; + year += 4799; + } + + int32_t century = year / 100; + + int32_t julian = year * 365 - 32167; + julian += year / 4 - century + century / 4; + julian += 7834 * month / 256 + day; + + return julian; +} + +void DateFunctions::JulianToDate(int32_t julian_date, int32_t &year, int32_t &month, + int32_t &day) { + // From Postgres j2date() + + uint32_t julian = static_cast(julian_date); + julian += 32044; + + uint32_t quad = julian / 146097; + + uint32_t extra = (julian - quad * 146097) * 4 + 3; + julian += 60 + quad * 3 + extra / 146097; + quad = julian / 1461; + julian -= quad * 1461; + + int32_t y = julian * 4 / 1461; + julian = ((y != 0) ? (julian + 305) % 365 : (julian + 306) % 366) + 123; + y += quad * 4; + + // Set year + year = static_cast(y - 4800); + quad = julian * 2141 / 65536; + + // Set day + day = julian - 7834 * quad / 256; + + // Set month + month = (quad + 10) % 12 + 1; +} + +namespace { + +template +bool TryParseInt(const char *&data, const char *end, T &out) { + static_assert(std::is_integral::value, + "ParseInt() must only be called with integer types"); + + // Initialize + out = 0; + + // Trim leading whitespace + while (*data == ' ') { + data++; + } + + // Return if no more data + if (data == end) { + return false; + } + + const char *snapshot = data; + while (data != end) { + if (*data < '0' || *data > '9') { + // Not a valid integer, stop + break; + } + + // Update running sum + out = (out * 10) + (*data - '0'); + + // Move along + data++; + } + + return snapshot != data; +} + +} // namespace + int32_t DateFunctions::InputDate( - UNUSED_ATTRIBUTE const codegen::type::Type &type, - UNUSED_ATTRIBUTE const char *data, UNUSED_ATTRIBUTE uint32_t len) { - return 0; + UNUSED_ATTRIBUTE const codegen::type::Type &type, const char *data, + uint32_t len) { + // Okay, Postgres supports a crap-tonne of different date-time and timestamp + // formats. I don't want to spend time implementing them all. For now, let's + // cover the most common formats: yyyy-mm-dd + + const char *curr_ptr = data; + const char *end = data + len; + + uint32_t nums[3] = {0, 0, 0}; + uint32_t year, month, day; + + for (uint32_t i = 0; i < 3; i++) { + bool parsed = TryParseInt(curr_ptr, end, nums[i]); + + bool unexpected_next_char = (*curr_ptr != '-' && *curr_ptr != '/'); + if (!parsed || (i != 2 && unexpected_next_char)) { + goto unsupported; + } + + curr_ptr++; + } + + // Looks okay ... let's check the components. + year = nums[0], month = nums[1], day = nums[2]; + + if (month == 0 || month > 12 || day == 0 || day > 31) { + goto unsupported; + } + + switch (month) { + case 2: { + uint32_t days_in_feb = + ((year % 4 == 0 && year % 100 != 0) || (year % 400 == 0)) ? 29 : 28; + if (day > days_in_feb) { + goto unsupported; + } + break; + } + case 4: + case 6: + case 9: + case 11: { + if (day > 30) { + goto unsupported; + } + break; + } + default: { + if (day > 31) { + goto unsupported; + } + break; + } + } + + return DateToJulian(year, month, day); + +unsupported: + codegen::RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); } } // namespace expression diff --git a/src/include/function/date_functions.h b/src/include/function/date_functions.h index fc5973ac95b..73e95a512a5 100644 --- a/src/include/function/date_functions.h +++ b/src/include/function/date_functions.h @@ -1,54 +1,76 @@ -//===----------------------------------------------------------------------===// -// -// Peloton -// -// date_functions.h -// -// Identification: src/include/function/date_functions.h -// -// Copyright (c) 2015-2017, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include -#include - -#include "common/logger.h" -#include "common/internal_types.h" -#include "type/value.h" - -namespace peloton { - -namespace codegen { -namespace type { -class Type; -} // namespace type -} // namespace codegen - -namespace function { - -class DateFunctions { - public: - /** - * Function used to return the current date/time. Normally called at the start - * of a transaction, and consistent throughout its duration. - * - * @return The current date at the time of invocation - */ - static int64_t Now(); - static type::Value _Now(const std::vector &args); - - /** - * - * @param data - * @param len - * @return - */ - static int32_t InputDate(const codegen::type::Type &type, const char *data, - uint32_t len); -}; - -} // namespace function -} // namespace peloton +//===----------------------------------------------------------------------===// +// +// Peloton +// +// date_functions.h +// +// Identification: src/include/function/date_functions.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include "type/value.h" + +namespace peloton { + +namespace codegen { +namespace type { +class Type; +} // namespace type +} // namespace codegen + +namespace function { + +class DateFunctions { + public: + /** + * Function used to return the current date/time. Normally called at the start + * of a transaction, and consistent throughout its duration. + * + * @return The current date at the time of invocation + */ + static int64_t Now(); + static type::Value _Now(const std::vector &args); + + /** + * Convert the given input into a Julian date format. + * + * @param year The year + * @param month The month (1-based) + * @param day The day (1-based) + * @return The equivalent 32-bit integer representation of the date + */ + static int32_t DateToJulian(int32_t year, int32_t month, int32_t day); + + /** + * Decompose the given 32-bit Julian date value into year, month, and day + * components. + * + * @param julian_date The julian date + * @param year[out] Where the year is written + * @param month[out] Where the result month is written + * @param day[out] Where the result day is written + */ + static void JulianToDate(int32_t julian_date, int32_t &year, int32_t &month, + int32_t &day); + + /** + * Convert the given input string into a date. + * + * @param data A pointer to a string representation of a date + * @param len The length of the string + * @return A suitable date representation of the given input string that can + * be stored in the data tables. This typically means a Julian date. + */ + static int32_t InputDate(const codegen::type::Type &type, const char *data, + uint32_t len); +}; + +} // namespace function +} // namespace peloton diff --git a/src/type/date_type.cpp b/src/type/date_type.cpp index d99617178f4..86e9f8b7af6 100644 --- a/src/type/date_type.cpp +++ b/src/type/date_type.cpp @@ -6,12 +6,13 @@ // // Identification: src/type/date_type.cpp // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// #include "type/date_type.h" +#include "function/date_functions.h" #include "type/value_factory.h" namespace peloton { @@ -19,122 +20,106 @@ namespace type { DateType::DateType() : Type(TypeId::DATE) {} -CmpBool DateType::CompareEquals(const Value& left, const Value& right) const { +CmpBool DateType::CompareEquals(const Value &left, const Value &right) const { PELOTON_ASSERT(left.CheckComparable(right)); if (left.IsNull() || right.IsNull()) return CmpBool::NULL_; return GetCmpBool(left.GetAs() == right.GetAs()); } -CmpBool DateType::CompareNotEquals(const Value& left, - const Value& right) const { +CmpBool DateType::CompareNotEquals(const Value &left, + const Value &right) const { PELOTON_ASSERT(left.CheckComparable(right)); if (right.IsNull()) return CmpBool::NULL_; return GetCmpBool(left.GetAs() != right.GetAs()); } -CmpBool DateType::CompareLessThan(const Value& left, const Value& right) const { +CmpBool DateType::CompareLessThan(const Value &left, const Value &right) const { PELOTON_ASSERT(left.CheckComparable(right)); if (left.IsNull() || right.IsNull()) return CmpBool::NULL_; return GetCmpBool(left.GetAs() < right.GetAs()); } -CmpBool DateType::CompareLessThanEquals(const Value& left, - const Value& right) const { +CmpBool DateType::CompareLessThanEquals(const Value &left, + const Value &right) const { PELOTON_ASSERT(left.CheckComparable(right)); if (left.IsNull() || right.IsNull()) return CmpBool::NULL_; return GetCmpBool(left.GetAs() <= right.GetAs()); } -CmpBool DateType::CompareGreaterThan(const Value& left, - const Value& right) const { +CmpBool DateType::CompareGreaterThan(const Value &left, + const Value &right) const { PELOTON_ASSERT(left.CheckComparable(right)); if (left.IsNull() || right.IsNull()) return CmpBool::NULL_; return GetCmpBool(left.GetAs() > right.GetAs()); } -CmpBool DateType::CompareGreaterThanEquals(const Value& left, - const Value& right) const { +CmpBool DateType::CompareGreaterThanEquals(const Value &left, + const Value &right) const { PELOTON_ASSERT(left.CheckComparable(right)); if (left.IsNull() || right.IsNull()) return CmpBool::NULL_; return GetCmpBool(left.GetAs() >= right.GetAs()); } -Value DateType::Min(const Value& left, const Value& right) const { +Value DateType::Min(const Value &left, const Value &right) const { PELOTON_ASSERT(left.CheckComparable(right)); if (left.IsNull() || right.IsNull()) return left.OperateNull(right); if (left.CompareLessThan(right) == CmpBool::CmpTrue) return left.Copy(); return right.Copy(); } -Value DateType::Max(const Value& left, const Value& right) const { +Value DateType::Max(const Value &left, const Value &right) const { PELOTON_ASSERT(left.CheckComparable(right)); if (left.IsNull() || right.IsNull()) return left.OperateNull(right); if (left.CompareGreaterThan(right) == CmpBool::CmpTrue) return left.Copy(); return right.Copy(); } -// Debug -std::string DateType::ToString(const Value& val) const { - if (val.IsNull()) return "date_null"; - int32_t tm = val.value_.date; - tm /= 1000000; - tm /= 100000; - uint16_t year = tm % 10000; - tm /= 10000; - int tz = tm % 27; - tz -= 12; - tm /= 27; - uint16_t day = tm % 32; - tm /= 32; - uint16_t month = tm; - char str[30]; - char zone[5]; - sprintf(str, "%04d-%02d-%02d", year, month, day); - if (tz >= 0) { - str[26] = '+'; - } else - str[26] = '-'; - if (tz < 0) tz = -tz; - sprintf(zone, "%02d", tz); - str[27] = 0; - return std::string(std::string(str) + std::string(zone)); +std::string DateType::ToString(const Value &val) const { + // Null + if (val.IsNull()) { + return "date_null"; + } + + int32_t year, month, day; + function::DateFunctions::JulianToDate(val.value_.date, year, month, day); + return StringUtil::Format("%04d-%02d-%02d", year, month, day); } // Compute a hash value -size_t DateType::Hash(const Value& val) const { +size_t DateType::Hash(const Value &val) const { return std::hash{}(val.value_.date); } -void DateType::HashCombine(const Value& val, size_t& seed) const { +void DateType::HashCombine(const Value &val, size_t &seed) const { val.hash_combine(seed, val.value_.date); } -void DateType::SerializeTo(const Value& val, SerializeOutput& out) const { +void DateType::SerializeTo(const Value &val, SerializeOutput &out) const { out.WriteInt(val.value_.date); } -void DateType::SerializeTo(const Value& val, char* storage, +void DateType::SerializeTo(const Value &val, char *storage, bool inlined UNUSED_ATTRIBUTE, - AbstractPool* pool UNUSED_ATTRIBUTE) const { - *reinterpret_cast(storage) = val.value_.date; + AbstractPool *pool UNUSED_ATTRIBUTE) const { + *reinterpret_cast(storage) = val.value_.date; } // Deserialize a value of the given type from the given storage space. -Value DateType::DeserializeFrom(const char* storage, +Value DateType::DeserializeFrom(const char *storage, const bool inlined UNUSED_ATTRIBUTE, - AbstractPool* pool UNUSED_ATTRIBUTE) const { - int32_t val = *reinterpret_cast(storage); + AbstractPool *pool UNUSED_ATTRIBUTE) const { + int32_t val = *reinterpret_cast(storage); return Value(type_id_, static_cast(val)); } -Value DateType::DeserializeFrom(SerializeInput& in UNUSED_ATTRIBUTE, - AbstractPool* pool UNUSED_ATTRIBUTE) const { +Value DateType::DeserializeFrom(SerializeInput &in UNUSED_ATTRIBUTE, + AbstractPool *pool UNUSED_ATTRIBUTE) const { return Value(type_id_, in.ReadInt()); } // Create a copy of this value -Value DateType::Copy(const Value& val) const { return Value(val); } +Value DateType::Copy(const Value &val) const { return Value(val); } -Value DateType::CastAs(const Value& val, const TypeId type_id) const { +Value DateType::CastAs(const Value &val, const TypeId type_id) const { switch (type_id) { case TypeId::DATE: return Copy(val); @@ -144,7 +129,7 @@ Value DateType::CastAs(const Value& val, const TypeId type_id) const { default: break; } - throw Exception("Date is not coercable to " + + throw Exception("Date is not coercible to " + Type::GetInstance(type_id)->ToString()); } From c4ede0aa1ee306005a69add2c61dc8cfb63e6468 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Wed, 16 May 2018 01:00:02 -0400 Subject: [PATCH 26/42] Compile fixes for GCC 6+ --- src/codegen/codegen.cpp | 18 +++++++++++++++++- src/function/date_functions.cpp | 4 ++-- src/include/common/macros.h | 15 ++++----------- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index e0082f7d588..0f8b426b61c 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -163,16 +163,23 @@ llvm::Value *CodeGen::Printf(const std::string &format, const std::vector &args) { auto *printf_fn = LookupBuiltin("printf"); if (printf_fn == nullptr) { +#if GCC_AT_LEAST_6 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wignored-attributes" +#endif printf_fn = RegisterBuiltin( "printf", llvm::TypeBuilder::get(GetContext()), reinterpret_cast(printf)); +#if GCC_AT_LEAST_6 +#pragma GCC diagnostic pop +#endif } // Collect all the arguments into a vector std::vector printf_args = {ConstString(format, "format")}; printf_args.insert(printf_args.end(), args.begin(), args.end()); - // Call the function + // Call printf() return CallFunc(printf_fn, printf_args); } @@ -181,11 +188,20 @@ llvm::Value *CodeGen::Memcmp(llvm::Value *ptr1, llvm::Value *ptr2, static constexpr char kMemcmpFnName[] = "memcmp"; auto *memcmp_fn = LookupBuiltin(kMemcmpFnName); if (memcmp_fn == nullptr) { +#if GCC_AT_LEAST_6 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wignored-attributes" +#endif memcmp_fn = RegisterBuiltin( kMemcmpFnName, llvm::TypeBuilder::get(GetContext()), reinterpret_cast(printf)); +#if GCC_AT_LEAST_6 +#pragma GCC diagnostic pop +#endif } + + // Call memcmp() return CallFunc(memcmp_fn, {ptr1, ptr2, len}); } diff --git a/src/function/date_functions.cpp b/src/function/date_functions.cpp index ad4681b7714..ac37f21492c 100644 --- a/src/function/date_functions.cpp +++ b/src/function/date_functions.cpp @@ -82,8 +82,8 @@ int32_t DateFunctions::DateToJulian(int32_t year, int32_t month, int32_t day) { return julian; } -void DateFunctions::JulianToDate(int32_t julian_date, int32_t &year, int32_t &month, - int32_t &day) { +void DateFunctions::JulianToDate(int32_t julian_date, int32_t &year, + int32_t &month, int32_t &day) { // From Postgres j2date() uint32_t julian = static_cast(julian_date); diff --git a/src/include/common/macros.h b/src/include/common/macros.h index e7f2dc95008..96aaf6ab0d2 100644 --- a/src/include/common/macros.h +++ b/src/include/common/macros.h @@ -97,20 +97,13 @@ namespace peloton { #endif /* CHECK_INVARIANTS */ //===--------------------------------------------------------------------===// -// override +// Compiler version checks //===--------------------------------------------------------------------===// -#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7) -#define GCC_AT_LEAST_47 1 +#if __GNUC__ > 6 || (__GNUC__ == 6 && __GNUC_MINOR__ >= 0) +#define GCC_AT_LEAST_6 1 #else -#define GCC_AT_LEAST_47 0 -#endif - -// g++-4.6 does not support override -#if GCC_AT_LEAST_47 -#define OVERRIDE override -#else -#define OVERRIDE +#define GCC_AT_LEAST_6 0 #endif //===--------------------------------------------------------------------===// From c50b665b3ad485416b79dfe8400188a6434a14f8 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Wed, 16 May 2018 02:26:04 -0400 Subject: [PATCH 27/42] Get string inputs working --- src/codegen/operator/csv_scan_translator.cpp | 12 ++++++++++-- src/codegen/proxy/string_functions_proxy.cpp | 2 ++ src/codegen/type/varchar_type.cpp | 5 ++--- src/codegen/util/csv_scanner.cpp | 13 ++++++++----- src/function/string_functions.cpp | 8 ++++++++ .../codegen/proxy/string_functions_proxy.h | 1 + src/include/function/string_functions.h | 16 ++++++++++++++++ src/traffic_cop/traffic_cop.cpp | 5 +++++ 8 files changed, 52 insertions(+), 10 deletions(-) diff --git a/src/codegen/operator/csv_scan_translator.cpp b/src/codegen/operator/csv_scan_translator.cpp index 8603a043e89..9e8880f70c0 100644 --- a/src/codegen/operator/csv_scan_translator.cpp +++ b/src/codegen/operator/csv_scan_translator.cpp @@ -132,8 +132,16 @@ class CSVColumnAccess : public RowBatch::AttributeAccess { llvm::Value *data_len) const { auto *input_func = SqlType().GetInputFunction(codegen, ai_->type); auto *raw_val = codegen.CallFunc(input_func, {type, data_ptr, data_len}); - return codegen::Value{ai_->type, raw_val, nullptr, - codegen.ConstBool(false)}; + if (SqlType().IsVariableLength()) { + // StrWithLen + llvm::Value *str_ptr = codegen->CreateExtractValue(raw_val, 0); + llvm::Value *str_len = codegen->CreateExtractValue(raw_val, 1); + return codegen::Value{ai_->type, str_ptr, str_len, + codegen.ConstBool(false)}; + } else { + return codegen::Value{ai_->type, raw_val, nullptr, + codegen.ConstBool(false)}; + } } Value Access(CodeGen &codegen, UNUSED_ATTRIBUTE RowBatch::Row &row) override { diff --git a/src/codegen/proxy/string_functions_proxy.cpp b/src/codegen/proxy/string_functions_proxy.cpp index db765480e9b..bc0a6bce6fd 100644 --- a/src/codegen/proxy/string_functions_proxy.cpp +++ b/src/codegen/proxy/string_functions_proxy.cpp @@ -14,6 +14,7 @@ #include "codegen/proxy/executor_context_proxy.h" #include "codegen/proxy/pool_proxy.h" +#include "codegen/proxy/runtime_functions_proxy.h" namespace peloton { namespace codegen { @@ -32,6 +33,7 @@ DEFINE_METHOD(peloton::function, StringFunctions, Substr); DEFINE_METHOD(peloton::function, StringFunctions, Repeat); DEFINE_METHOD(peloton::function, StringFunctions, CompareStrings); DEFINE_METHOD(peloton::function, StringFunctions, WriteString); +DEFINE_METHOD(peloton::function, StringFunctions, InputString); } // namespace codegen } // namespace peloton diff --git a/src/codegen/type/varchar_type.cpp b/src/codegen/type/varchar_type.cpp index dc3ab961f3d..f786bc83945 100644 --- a/src/codegen/type/varchar_type.cpp +++ b/src/codegen/type/varchar_type.cpp @@ -597,9 +597,8 @@ void Varchar::GetTypeForMaterialization(CodeGen &codegen, llvm::Type *&val_type, } llvm::Function *Varchar::GetInputFunction( - UNUSED_ATTRIBUTE CodeGen &codegen, - UNUSED_ATTRIBUTE const Type &type) const { - throw NotImplementedException{"String input not implemented yet"}; + CodeGen &codegen, UNUSED_ATTRIBUTE const Type &type) const { + return StringFunctionsProxy::InputString.GetFunction(codegen); } llvm::Function *Varchar::GetOutputFunction( diff --git a/src/codegen/util/csv_scanner.cpp b/src/codegen/util/csv_scanner.cpp index da606fcaac3..0481a4444e1 100644 --- a/src/codegen/util/csv_scanner.cpp +++ b/src/codegen/util/csv_scanner.cpp @@ -197,6 +197,9 @@ const char *CSVScanner::NextLine() { uint32_t line_end = buffer_begin_; + char quote = quote_; + char escape = (quote_ == escape_ ? static_cast('\0') : escape_); + while (true) { if (line_end >= buffer_end_) { // We need to read more data from the CSV file. But first, we need to copy @@ -219,13 +222,13 @@ const char *CSVScanner::NextLine() { // Read character char c = buffer_[line_end]; - if (in_quote && c == escape_) { - last_was_escape = true; + if (in_quote && c == escape) { + last_was_escape = !last_was_escape; } - if (c == quote_ && !last_was_escape) { - in_quote = true; + if (c == quote && !last_was_escape) { + in_quote = !in_quote; } - if (c != escape_) { + if (c != escape) { last_was_escape = false; } diff --git a/src/function/string_functions.cpp b/src/function/string_functions.cpp index 2cf8d9f0b89..75af3a67523 100644 --- a/src/function/string_functions.cpp +++ b/src/function/string_functions.cpp @@ -245,5 +245,13 @@ void StringFunctions::WriteString(const char *data, uint32_t len, char *buf, *reinterpret_cast(buf) = area; } +// TODO(pmenon): UTF8 checking, string checking, lots of error handling here +// TODO(pmenon): Why do we need this +1 on the length ? +StringFunctions::StrWithLen StringFunctions::InputString( + UNUSED_ATTRIBUTE const codegen::type::Type &type, const char *data, + uint32_t len) { + return StringFunctions::StrWithLen{data, len + 1}; +} + } // namespace function } // namespace peloton diff --git a/src/include/codegen/proxy/string_functions_proxy.h b/src/include/codegen/proxy/string_functions_proxy.h index bff1911d0a2..27a24995e3a 100644 --- a/src/include/codegen/proxy/string_functions_proxy.h +++ b/src/include/codegen/proxy/string_functions_proxy.h @@ -31,6 +31,7 @@ PROXY(StringFunctions) { DECLARE_METHOD(Repeat); DECLARE_METHOD(CompareStrings); DECLARE_METHOD(WriteString); + DECLARE_METHOD(InputString); }; PROXY(StrWithLen) { diff --git a/src/include/function/string_functions.h b/src/include/function/string_functions.h index db79dc2409e..47c72c62e15 100644 --- a/src/include/function/string_functions.h +++ b/src/include/function/string_functions.h @@ -16,6 +16,12 @@ namespace peloton { +namespace codegen { +namespace type { +class Type; +} // namespace type +} // namespace codegen + namespace executor { class ExecutorContext; } // namespace executor @@ -104,6 +110,16 @@ class StringFunctions { */ static void WriteString(const char *data, uint32_t len, char *buf, peloton::type::AbstractPool &pool); + + /** + * + * @param type + * @param data + * @param len + * @return + */ + static StrWithLen InputString(const codegen::type::Type &type, + const char *data, uint32_t len); }; } // namespace function diff --git a/src/traffic_cop/traffic_cop.cpp b/src/traffic_cop/traffic_cop.cpp index a87d99c0ac5..7bfffebb4c0 100644 --- a/src/traffic_cop/traffic_cop.cpp +++ b/src/traffic_cop/traffic_cop.cpp @@ -523,6 +523,11 @@ FieldInfo TrafficCop::GetColumnFieldForValueType(std::string column_name, field_size = 255; break; } + case type::TypeId::DATE: { + field_type = PostgresValueType::DATE; + field_size = 4; + break; + } case type::TypeId::TIMESTAMP: { field_type = PostgresValueType::TIMESTAMPS; field_size = 64; // FIXME: Bytes??? From d6bb8738d4834fe36815e451da23c2ebc62607b2 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Wed, 16 May 2018 12:11:04 -0400 Subject: [PATCH 28/42] Beefed up tests --- src/function/numeric_functions.cpp | 74 ++++++++++++++------------- src/include/index/bwtree.h | 2 +- test/codegen/value_integrity_test.cpp | 73 +++++++++++++++++++++++--- 3 files changed, 107 insertions(+), 42 deletions(-) diff --git a/src/function/numeric_functions.cpp b/src/function/numeric_functions.cpp index 50a00ee516a..f4a943c8ce0 100644 --- a/src/function/numeric_functions.cpp +++ b/src/function/numeric_functions.cpp @@ -182,24 +182,6 @@ type::Value NumericFunctions::_Round(const std::vector &args) { namespace { -/** - * Skip all leading and trailing whitespace from the string bounded by the - * provided pointers. This function will modify the input pointers to point to - * the first non-whitespace space character at the start and end of the input - * string. - * - * @param[in,out] left Pointer to the left-most character in the input string - * @param[in,out] right Pointer to the right-most character in the input string - */ -void TrimLeftRight(const char *&left, const char *&right) { - while (*left == ' ') { - left++; - } - while (right > left && *(right - 1) == ' ') { - right--; - } -} - /** * Convert the provided input string into an integral number. This function * handles leading whitespace and leading negative (-) or positive (+) signs. @@ -216,16 +198,13 @@ T ParseInteger(const char *ptr, uint32_t len) { static_assert(std::is_integral::value, "Must provide integer-type when calling ParseInteger"); - if (len == 0) { - codegen::RuntimeFunctions::ThrowInvalidInputStringException(); - __builtin_unreachable(); - } - const char *start = ptr; const char *end = start + len; - // Trim leading and trailing whitespace - TrimLeftRight(start, end); + // Trim leading whitespace + while (start < end && *start == ' ') { + start++; + } // Check negative or positive sign bool negative = false; @@ -238,10 +217,9 @@ T ParseInteger(const char *ptr, uint32_t len) { // Convert int64_t num = 0; - while (start != end) { + while (start < end) { if (*start < '0' || *start > '9') { - codegen::RuntimeFunctions::ThrowInvalidInputStringException(); - __builtin_unreachable(); + break; } num = (num * 10) + (*start - '0'); @@ -249,7 +227,16 @@ T ParseInteger(const char *ptr, uint32_t len) { start++; } - PELOTON_ASSERT(start == end); + // Trim trailing whitespace + while (start < end && *start == ' ') { + start++; + } + + // If we haven't consumed everything at this point, it was an invalid input + if (start < end) { + codegen::RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); + } // Negate number if we need to if (negative) { @@ -279,10 +266,13 @@ bool NumericFunctions::InputBoolean( __builtin_unreachable(); } - const char *start = ptr, *end = ptr + len; + const char *start = ptr; + const char *end = ptr + len; - // Trim leading and trailing whitespace - TrimLeftRight(start, end); + // Trim leading whitespace + while (start < end && *start == ' ') { + start++; + } // uint64_t trimmed_len = end - start; @@ -393,13 +383,16 @@ double NumericFunctions::InputDecimal( __builtin_unreachable(); } + const char *start = ptr; + const char *end = ptr + len; + // We don't trim because std::strtod() does the trimming for us // TODO(pmenon): Optimize me later - char *end = nullptr; - double ret = std::strtod(ptr, &end); + char *consumed_ptr = nullptr; + double ret = std::strtod(ptr, &consumed_ptr); - if (unlikely_branch(end == ptr)) { + if (unlikely_branch(consumed_ptr == start)) { if (errno == ERANGE) { codegen::RuntimeFunctions::ThrowOverflowException(); __builtin_unreachable(); @@ -409,6 +402,17 @@ double NumericFunctions::InputDecimal( } } + // Eat the rest + while (consumed_ptr < end && *consumed_ptr == ' ') { + consumed_ptr++; + } + + // If we haven't consumed everything at this point, it was an invalid input + if (consumed_ptr < end) { + codegen::RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); + } + // Done return ret; } diff --git a/src/include/index/bwtree.h b/src/include/index/bwtree.h index f9352aad09a..abb293f2e67 100755 --- a/src/include/index/bwtree.h +++ b/src/include/index/bwtree.h @@ -7585,7 +7585,7 @@ class BwTree : public BwTreeBase { // would always fail, until we have cleaned all epoch nodes current_epoch_p = nullptr; - LOG_TRACE("Clearing the epoch in ~EpochManager()..."); + LOG_TRACE("Clearing the epoch in ~EpochManager() ..."); // If all threads has exited then all thread counts are // 0, and therefore this should proceed way to the end diff --git a/test/codegen/value_integrity_test.cpp b/test/codegen/value_integrity_test.cpp index 97429771708..87450683afc 100644 --- a/test/codegen/value_integrity_test.cpp +++ b/test/codegen/value_integrity_test.cpp @@ -6,7 +6,7 @@ // // Identification: test/codegen/value_integrity_test.cpp // -// Copyright (c) 2015-17, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -190,8 +190,9 @@ void TestInputIntegral( extra_valid_tests.end()); // Default invalid tests - std::vector invalid_tests = {"a", "-b", "+c", " 1c", - "2d ", "3 3", "-4 4"}; + std::vector invalid_tests = {"a", "-b", "+c", " 1c", + "2d ", "3 3", "-4 4", "-5 a ", + " -6 a", " c 7 "}; invalid_tests.insert(invalid_tests.end(), extra_invalid_tests.begin(), extra_invalid_tests.end()); @@ -205,19 +206,25 @@ void TestInputIntegral( for (const auto &test : valid_tests) { auto *ptr = test.first.data(); auto len = static_cast(test.first.length()); - EXPECT_EQ(test.second, TestFunc(type, ptr, len)); + try { + EXPECT_EQ(test.second, TestFunc(type, ptr, len)); + } catch (std::exception &e) { + EXPECT_TRUE(false) << "Valid input '" << test.first << "' threw an error"; + } } for (const auto &test : invalid_tests) { auto *ptr = test.data(); auto len = static_cast(test.length()); - EXPECT_THROW(TestFunc(type, ptr, len), std::runtime_error); + EXPECT_THROW(TestFunc(type, ptr, len), std::runtime_error) + << "Input '" << test << "' was expected to throw an error, but did not"; } for (const auto &test : overflow_tests) { auto *ptr = test.data(); auto len = static_cast(test.length()); - EXPECT_THROW(TestFunc(type, ptr, len), std::overflow_error); + EXPECT_THROW(TestFunc(type, ptr, len), std::overflow_error) + << "Input '" << test << "' expected to overflow, but did not"; } } } // namespace @@ -238,5 +245,59 @@ TEST_F(ValueIntegrityTest, InputIntegralTypesTest) { TestInputIntegral(bigint, function::NumericFunctions::InputBigInt); } +TEST_F(ValueIntegrityTest, InputDecimalTypesTest) { + codegen::type::Type decimal{type::TypeId::DECIMAL, false}; + + // First check some valid cases + std::vector> valid_tests = { + {"0.0", 0.0}, + {"-1.0", -1.0}, + {"2.0", 2.0}, + {"+3.0", 3.0}, + {" 4.0", 4.0}, + {" -5.0", -5.0}, + {" +6.0", 6.0}, + {"7.0 ", 7.0}, + {"-8.0 ", -8.0}, + {" 9.0 ", 9.0}, + {" -10.0 ", -10.0}, + {" +11.0 ", 11.0}}; + + for (const auto &test_case : valid_tests) { + auto *ptr = test_case.first.data(); + auto len = static_cast(test_case.first.length()); + EXPECT_EQ(test_case.second, + function::NumericFunctions::InputDecimal(decimal, ptr, len)); + } + + // Now let's try some invalid ones. Take each valid test and randomly insert + // a character somewhere. + std::vector invalid_tests; + + std::random_device rd; + std::mt19937 rng(rd()); + + for (const auto &valid_test : valid_tests) { + auto orig = valid_test.first; + + std::uniform_int_distribution<> dist(0, orig.length()); + auto pos = dist(rng); + + auto invalid_num = orig.substr(0, pos) + "aa" + orig.substr(pos); + + invalid_tests.push_back(invalid_num); + } + + // Now check that each test throws an invalid string error + for (const auto &invalid_test : invalid_tests) { + auto *ptr = invalid_test.data(); + auto len = static_cast(invalid_test.length()); + EXPECT_THROW(function::NumericFunctions::InputDecimal(decimal, ptr, len), + std::runtime_error) + << "Input '" << invalid_test + << "' expected to throw error, but passed parsing logic"; + } +} + } // namespace test } // namespace peloton \ No newline at end of file From b34ba308471bc1e9c73d8b4b0e6b8c879869d9b5 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Wed, 16 May 2018 14:49:51 -0400 Subject: [PATCH 29/42] Simple CSV scan test --- test/codegen/csv_scan_test.cpp | 117 +++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 test/codegen/csv_scan_test.cpp diff --git a/test/codegen/csv_scan_test.cpp b/test/codegen/csv_scan_test.cpp new file mode 100644 index 00000000000..f40fc823e80 --- /dev/null +++ b/test/codegen/csv_scan_test.cpp @@ -0,0 +1,117 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// csv_scan_test.cpp +// +// Identification: test/codegen/csv_scan_test.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "codegen/testing_codegen_util.h" + +#include "codegen/util/csv_scanner.h" +#include "common/timer.h" +#include "util/file_util.h" + +namespace peloton { +namespace test { + +class CSVScanTest : public PelotonCodeGenTest {}; + +using CallbackFn = + std::function; + +struct State { + codegen::util::CSVScanner *scanner; + CallbackFn callback; +}; + +struct TempFileHandle { + std::string name; + TempFileHandle(std::string _name) : name(_name) {} + ~TempFileHandle() { boost::filesystem::remove(name); } +}; + +void CSVRowCallback(void *s) { + auto *state = reinterpret_cast(s); + state->callback(state->scanner->GetColumns()); +} + +void IterateAsCSV(const std::vector &rows, + const std::vector &col_types, + CallbackFn callback, char delimiter = ',') { + std::string csv_data; + for (uint32_t i = 0; i < rows.size(); i++) { + csv_data.append(rows[i]).append("\n"); + } + + // Write the contents into a temporary file + TempFileHandle fh{FileUtil::WriteTempFile(csv_data, "", "tmp")}; + + // The memory pool + auto &pool = *TestingHarness::GetInstance().GetTestingPool(); + + // The client-state + State state = {.scanner = nullptr, .callback = callback}; + + // The scanner + codegen::util::CSVScanner scanner{ + pool, fh.name, col_types.data(), static_cast(col_types.size()), + CSVRowCallback, reinterpret_cast(&state), delimiter}; + + state.scanner = &scanner; + + // Iterate! + scanner.Produce(); +} + +TEST_F(CSVScanTest, SimpleNumericScan) { + // Create a temporary CSV file + std::vector rows = {"1,2,3.0,4", "4,5,6.0,7", "8,9,10.0,11"}; + std::vector types = {{type::TypeId::INTEGER, false}, + {type::TypeId::INTEGER, false}, + {type::TypeId::DECIMAL, false}, + {type::TypeId::INTEGER, false}}; + + uint32_t rows_read = 0; + IterateAsCSV(rows, types, [&rows_read, &types]( + const codegen::util::CSVScanner::Column *cols) { + rows_read++; + for (uint32_t i = 0; i < types.size(); i++) { + EXPECT_FALSE(cols[i].is_null); + EXPECT_GT(cols[i].len, 0); + } + }); + + // Check + EXPECT_EQ(rows.size(), rows_read); +} + +TEST_F(CSVScanTest, MixedStringScan) { + // Create a temporary CSV file + std::vector rows = {"1,2,3,test", "4,5,6,\"test\"", + "8,9,10,\"test\nnewline\ninquote\""}; + std::vector types = {{type::TypeId::INTEGER, false}, + {type::TypeId::INTEGER, false}, + {type::TypeId::INTEGER, false}, + {type::TypeId::VARCHAR, false}}; + + uint32_t rows_read = 0; + IterateAsCSV(rows, types, [&rows_read, &types]( + const codegen::util::CSVScanner::Column *cols) { + rows_read++; + for (uint32_t i = 0; i < types.size(); i++) { + EXPECT_FALSE(cols[i].is_null); + EXPECT_GT(cols[i].len, 0); + } + }); + + // Check + EXPECT_EQ(rows.size(), rows_read); +} + +} // namespace test +} // namespace peloton \ No newline at end of file From 70d501275ea98b4264ab6a8203cecd83bdccce9a Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Wed, 16 May 2018 17:32:37 -0400 Subject: [PATCH 30/42] Updated optimize to continue support for old/weird/strange AF copy executor --- src/common/internal_types.cpp | 10 +- src/executor/copy_executor.cpp | 24 ++-- src/executor/plan_executor.cpp | 2 +- src/include/common/internal_types.h | 3 +- .../optimizer/child_property_deriver.h | 1 + src/include/optimizer/input_column_deriver.h | 2 + src/include/optimizer/operator_node.h | 3 +- src/include/optimizer/operator_visitor.h | 5 +- src/include/optimizer/operators.h | 27 +++- src/include/optimizer/plan_generator.h | 2 + src/include/optimizer/rule_impls.h | 15 +++ src/include/planner/copy_plan.h | 44 ------- src/include/planner/csv_scan_plan.h | 13 +- .../planner/export_external_file_plan.h | 119 ++++++++++++++++++ src/optimizer/child_property_deriver.cpp | 7 ++ src/optimizer/input_column_deriver.cpp | 4 + src/optimizer/operators.cpp | 55 +++++++- src/optimizer/optimizer_task.cpp | 16 ++- src/optimizer/plan_generator.cpp | 8 ++ .../query_to_operator_transformer.cpp | 4 +- src/optimizer/rule.cpp | 1 + src/optimizer/rule_impls.cpp | 31 +++++ src/optimizer/util.cpp | 3 - 23 files changed, 316 insertions(+), 83 deletions(-) delete mode 100644 src/include/planner/copy_plan.h create mode 100644 src/include/planner/export_external_file_plan.h diff --git a/src/common/internal_types.cpp b/src/common/internal_types.cpp index 427e9848e25..855f7ef2d9b 100644 --- a/src/common/internal_types.cpp +++ b/src/common/internal_types.cpp @@ -1382,9 +1382,6 @@ std::string PlanNodeTypeToString(PlanNodeType type) { case PlanNodeType::RESULT: { return ("RESULT"); } - case PlanNodeType::COPY: { - return ("COPY"); - } case PlanNodeType::MOCK: { return ("MOCK"); } @@ -1394,6 +1391,9 @@ std::string PlanNodeTypeToString(PlanNodeType type) { case PlanNodeType::ANALYZE: { return ("ANALYZE"); } + case PlanNodeType::EXPORT_EXTERNAL_FILE: { + return ("EXPORT_EXTERNAL_FILE"); + } default: { throw ConversionException( StringUtil::Format("No string conversion for PlanNodeType value '%d'", @@ -1461,12 +1461,12 @@ PlanNodeType StringToPlanNodeType(const std::string &str) { return PlanNodeType::HASH; } else if (upper_str == "RESULT") { return PlanNodeType::RESULT; - } else if (upper_str == "COPY") { - return PlanNodeType::COPY; } else if (upper_str == "MOCK") { return PlanNodeType::MOCK; } else if (upper_str == "ANALYZE") { return PlanNodeType::ANALYZE; + } else if (upper_str == "EXPORT_EXTERNAL_FILE") { + return PlanNodeType::EXPORT_EXTERNAL_FILE; } else { throw ConversionException(StringUtil::Format( "No PlanNodeType conversion from string '%s'", upper_str.c_str())); diff --git a/src/executor/copy_executor.cpp b/src/executor/copy_executor.cpp index e55d665bc6c..f499e899708 100644 --- a/src/executor/copy_executor.cpp +++ b/src/executor/copy_executor.cpp @@ -6,23 +6,25 @@ // // Identification: src/executor/copy_executor.cpp // -// Copyright (c) 2015-17, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// +#include "executor/copy_executor.h" + +#include +#include + #include "common/logger.h" #include "catalog/catalog.h" #include "concurrency/transaction_manager_factory.h" -#include "executor/copy_executor.h" #include "executor/executor_context.h" #include "executor/logical_tile_factory.h" -#include "planner/copy_plan.h" +#include "planner/export_external_file_plan.h" #include "storage/table_factory.h" #include "network/postgres_protocol_handler.h" #include "common/exception.h" #include "common/macros.h" -#include -#include namespace peloton { namespace executor { @@ -35,7 +37,7 @@ CopyExecutor::CopyExecutor(const planner::AbstractPlan *node, ExecutorContext *executor_context) : AbstractExecutor(node, executor_context) {} -CopyExecutor::~CopyExecutor() {} +CopyExecutor::~CopyExecutor() = default; /** * @brief Basic initialization. @@ -45,21 +47,19 @@ bool CopyExecutor::DInit() { PELOTON_ASSERT(children_.size() == 1); // Grab info from plan node and check it - const planner::CopyPlan &node = GetPlanNode(); + const auto &node = GetPlanNode(); - bool success = InitFileHandle(node.file_path.c_str(), "w"); + bool success = InitFileHandle(node.GetFileName().c_str(), "w"); if (success == false) { - throw ExecutorException("Failed to create file " + node.file_path + + throw ExecutorException("Failed to create file " + node.GetFileName() + ". Try absolute path and make sure you have the " "permission to access this file."); - return false; } - LOG_DEBUG("Created target copy output file: %s", node.file_path.c_str()); + LOG_DEBUG("Created target copy output file: %s", node.GetFileName().c_str()); return true; } - bool CopyExecutor::InitFileHandle(const char *name, const char *mode) { auto file = fopen(name, mode); if (file == NULL) { diff --git a/src/executor/plan_executor.cpp b/src/executor/plan_executor.cpp index a01330b7b6d..6226e3a26cf 100644 --- a/src/executor/plan_executor.cpp +++ b/src/executor/plan_executor.cpp @@ -339,7 +339,7 @@ executor::AbstractExecutor *BuildExecutorTree( new executor::CreateFunctionExecutor(plan, executor_context); break; - case PlanNodeType::COPY: + case PlanNodeType::EXPORT_EXTERNAL_FILE: child_executor = new executor::CopyExecutor(plan, executor_context); break; diff --git a/src/include/common/internal_types.h b/src/include/common/internal_types.h index 4654ec9bc77..22598226407 100644 --- a/src/include/common/internal_types.h +++ b/src/include/common/internal_types.h @@ -595,7 +595,7 @@ enum class PlanNodeType { // Utility RESULT = 70, - COPY = 71, + EXPORT_EXTERNAL_FILE = 71, CREATE_FUNC = 72, // Test @@ -1355,6 +1355,7 @@ enum class RuleType : uint32_t { INNER_JOIN_TO_HASH_JOIN, IMPLEMENT_DISTINCT, IMPLEMENT_LIMIT, + EXPORT_EXTERNAL_FILE_TO_PHYSICAL, // Don't move this one RewriteDelimiter, diff --git a/src/include/optimizer/child_property_deriver.h b/src/include/optimizer/child_property_deriver.h index dd887ff9af3..914cc77ab27 100644 --- a/src/include/optimizer/child_property_deriver.h +++ b/src/include/optimizer/child_property_deriver.h @@ -59,6 +59,7 @@ class ChildPropertyDeriver : public OperatorVisitor { void Visit(const PhysicalSortGroupBy *) override; void Visit(const PhysicalDistinct *) override; void Visit(const PhysicalAggregate *) override; + void Visit(const PhysicalExportExternalFile *) override; private: void DeriveForJoin(); diff --git a/src/include/optimizer/input_column_deriver.h b/src/include/optimizer/input_column_deriver.h index 728a08305c4..ef66823bba0 100644 --- a/src/include/optimizer/input_column_deriver.h +++ b/src/include/optimizer/input_column_deriver.h @@ -93,6 +93,8 @@ class InputColumnDeriver : public OperatorVisitor { void Visit(const PhysicalAggregate *) override; + void Visit(const PhysicalExportExternalFile *) override; + private: /** * @brief Provide all tuple value expressions needed in the expression diff --git a/src/include/optimizer/operator_node.h b/src/include/optimizer/operator_node.h index bfc0653518d..f870df330eb 100644 --- a/src/include/optimizer/operator_node.h +++ b/src/include/optimizer/operator_node.h @@ -72,7 +72,8 @@ enum class OpType { Update, Aggregate, HashGroupBy, - SortGroupBy + SortGroupBy, + ExportExternalFile, }; //===--------------------------------------------------------------------===// diff --git a/src/include/optimizer/operator_visitor.h b/src/include/optimizer/operator_visitor.h index 50fd98fa024..e225287cebb 100644 --- a/src/include/optimizer/operator_visitor.h +++ b/src/include/optimizer/operator_visitor.h @@ -6,7 +6,7 @@ // // Identification: src/include/optimizer/operator_visitor.h // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -23,7 +23,7 @@ namespace optimizer { class OperatorVisitor { public: - virtual ~OperatorVisitor(){}; + virtual ~OperatorVisitor() = default; // Physical operator virtual void Visit(const DummyScan *) {} @@ -49,6 +49,7 @@ class OperatorVisitor { virtual void Visit(const PhysicalSortGroupBy *) {} virtual void Visit(const PhysicalDistinct *) {} virtual void Visit(const PhysicalAggregate *) {} + virtual void Visit(const PhysicalExportExternalFile *) {} // Logical operator virtual void Visit(const LeafOperator *) {} diff --git a/src/include/optimizer/operators.h b/src/include/optimizer/operators.h index 8ec891c8131..8a7c7582e56 100644 --- a/src/include/optimizer/operators.h +++ b/src/include/optimizer/operators.h @@ -325,12 +325,19 @@ class LogicalUpdate : public OperatorNode { }; //===--------------------------------------------------------------------===// -// External file get +// Export to external file //===--------------------------------------------------------------------===// class LogicalExportExternalFile : public OperatorNode { public: - static Operator make(); + static Operator make(ExternalFileFormat format, std::string file_name); + + bool operator==(const BaseOperatorNode &r) override; + + hash_t Hash() const override; + + ExternalFileFormat format; + std::string file_name; }; //===--------------------------------------------------------------------===// @@ -604,6 +611,22 @@ class PhysicalUpdate : public OperatorNode { const std::vector> *updates; }; +//===--------------------------------------------------------------------===// +// Physical ExportExternalFile +//===--------------------------------------------------------------------===// +class PhysicalExportExternalFile + : public OperatorNode { + public: + static Operator make(ExternalFileFormat format, std::string file_name); + + bool operator==(const BaseOperatorNode &r) override; + + hash_t Hash() const override; + + ExternalFileFormat format; + std::string file_name; +}; + //===--------------------------------------------------------------------===// // PhysicalHashGroupBy //===--------------------------------------------------------------------===// diff --git a/src/include/optimizer/plan_generator.h b/src/include/optimizer/plan_generator.h index 353de6db29f..9fba272d4a8 100644 --- a/src/include/optimizer/plan_generator.h +++ b/src/include/optimizer/plan_generator.h @@ -94,6 +94,8 @@ class PlanGenerator : public OperatorVisitor { void Visit(const PhysicalAggregate *) override; + void Visit(const PhysicalExportExternalFile *) override; + private: /** * @brief Generate all tuple value expressions of a base table diff --git a/src/include/optimizer/rule_impls.h b/src/include/optimizer/rule_impls.h index 5ace068138d..57902e744a9 100644 --- a/src/include/optimizer/rule_impls.h +++ b/src/include/optimizer/rule_impls.h @@ -281,6 +281,21 @@ class ImplementLimit : public Rule { OptimizeContext *context) const override; }; +/** + * @brief Logical Export to External File -> Physical Export to External file + */ +class LogicalExportToPhysicalExport : public Rule { + public: + LogicalExportToPhysicalExport(); + + bool Check(std::shared_ptr plan, + OptimizeContext *context) const override; + + void Transform(std::shared_ptr input, + std::vector> &transformed, + OptimizeContext *context) const override; +}; + //===--------------------------------------------------------------------===// // Rewrite rules //===--------------------------------------------------------------------===// diff --git a/src/include/planner/copy_plan.h b/src/include/planner/copy_plan.h deleted file mode 100644 index 082598d10af..00000000000 --- a/src/include/planner/copy_plan.h +++ /dev/null @@ -1,44 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Peloton -// -// copy_plan.h -// -// Identification: src/include/planner/copy_plan.h -// -// Copyright (c) 2015-2018, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "planner/abstract_plan.h" - -namespace peloton { - -namespace storage { -class DataTable; -} // namespace storage - -namespace planner { - -class CopyPlan : public AbstractPlan { - public: - explicit CopyPlan(std::string file_path) : file_path(std::move(file_path)) {} - - PlanNodeType GetPlanNodeType() const override { return PlanNodeType::COPY; } - - const std::string GetInfo() const override { return "CopyPlan"; } - - // TODO: Implement copy mechanism - std::unique_ptr Copy() const override { return nullptr; } - - // The path of the target file - std::string file_path; - - private: - DISALLOW_COPY_AND_MOVE(CopyPlan); -}; - -} // namespace planner -} // namespace peloton \ No newline at end of file diff --git a/src/include/planner/csv_scan_plan.h b/src/include/planner/csv_scan_plan.h index 2f40999efd0..516debcaeeb 100644 --- a/src/include/planner/csv_scan_plan.h +++ b/src/include/planner/csv_scan_plan.h @@ -138,13 +138,22 @@ inline void CSVScanPlan::GetOutputColumns(std::vector &columns) const { } inline hash_t CSVScanPlan::Hash() const { - return HashUtil::HashBytes(file_name_.data(), file_name_.length()); + hash_t hash = HashUtil::HashBytes(file_name_.data(), file_name_.length()); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&delimiter_)); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash("e_)); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&escape_)); + hash = HashUtil::CombineHashes( + hash, HashUtil::HashBytes(null_.c_str(), null_.length())); + return hash; } inline bool CSVScanPlan::operator==(const AbstractPlan &rhs) const { if (rhs.GetPlanNodeType() != PlanNodeType::CSVSCAN) return false; const auto &other = static_cast(rhs); - return StringUtil::Upper(file_name_) == StringUtil::Upper(other.file_name_); + return ( + (StringUtil::Upper(file_name_) == StringUtil::Upper(other.file_name_)) && + delimiter_ == other.delimiter_ && quote_ == other.quote_ && + escape_ == other.escape_); } inline void CSVScanPlan::GetAttributes( diff --git a/src/include/planner/export_external_file_plan.h b/src/include/planner/export_external_file_plan.h new file mode 100644 index 00000000000..6962891a19d --- /dev/null +++ b/src/include/planner/export_external_file_plan.h @@ -0,0 +1,119 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// export_external_file_plan.h +// +// Identification: src/include/planner/export_external_file_plan.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "concurrency/transaction_context.h" +#include "planner/abstract_plan.h" + +namespace peloton { +namespace planner { + +class ExportExternalFilePlan : public AbstractPlan { + public: + ExportExternalFilePlan(std::string file_name, char delimiter = ',', + char quote = '"', char escape = '\"'); + + ////////////////////////////////////////////////////////////////////////////// + /// + /// Accessors + /// + ////////////////////////////////////////////////////////////////////////////// + + PlanNodeType GetPlanNodeType() const override; + + const std::string &GetFileName() const { return file_name_; } + + char GetDelimiterChar() const { return delimiter_; } + char GetQuoteChar() const { return quote_; } + char GetEscapeChar() const { return escape_; } + + ////////////////////////////////////////////////////////////////////////////// + /// + /// Utilities + Internal + /// + ////////////////////////////////////////////////////////////////////////////// + + hash_t Hash() const override; + + bool operator==(const AbstractPlan &rhs) const override; + + std::unique_ptr Copy() const override; + + void PerformBinding(BindingContext &binding_context) override; + + private: + std::vector output_attributes_; + + std::string file_name_; + + char delimiter_; + char quote_; + char escape_; +}; + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Implementation below +/// +//////////////////////////////////////////////////////////////////////////////// + +inline ExportExternalFilePlan::ExportExternalFilePlan(std::string file_name, + char delimiter, + char quote, char escape) + : file_name_(file_name), + delimiter_(delimiter), + quote_(quote), + escape_(escape) {} + +inline PlanNodeType ExportExternalFilePlan::GetPlanNodeType() const { + return PlanNodeType::EXPORT_EXTERNAL_FILE; +} + +inline hash_t ExportExternalFilePlan::Hash() const { + hash_t hash = HashUtil::HashBytes(file_name_.data(), file_name_.length()); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&delimiter_)); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash("e_)); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&escape_)); + return hash; +} + +inline bool ExportExternalFilePlan::operator==(const AbstractPlan &rhs) const { + if (rhs.GetPlanNodeType() != PlanNodeType::EXPORT_EXTERNAL_FILE) return false; + const auto &other = static_cast(rhs); + return ( + (StringUtil::Upper(file_name_) == StringUtil::Upper(other.file_name_)) && + delimiter_ == other.delimiter_ && quote_ == other.quote_ && + escape_ == other.escape_); +} + +inline std::unique_ptr ExportExternalFilePlan::Copy() const { + return std::unique_ptr{ + new ExportExternalFilePlan(file_name_, delimiter_, quote_, escape_)}; +} + +inline void ExportExternalFilePlan::PerformBinding( + BindingContext &binding_context) { + PELOTON_ASSERT(GetChildrenSize() == 1); + auto &child = *GetChild(0); + + std::vector child_output_cols; + child.GetOutputColumns(child_output_cols); + + output_attributes_.clear(); + for (const auto &col_id : child_output_cols) { + output_attributes_.push_back(binding_context.Find(col_id)); + } +} + +} // namespace planner +} // namespace peloton \ No newline at end of file diff --git a/src/optimizer/child_property_deriver.cpp b/src/optimizer/child_property_deriver.cpp index 5020302b614..39ca06d811b 100644 --- a/src/optimizer/child_property_deriver.cpp +++ b/src/optimizer/child_property_deriver.cpp @@ -193,6 +193,13 @@ void ChildPropertyDeriver::Visit(const DummyScan *) { make_pair(make_shared(), vector>())); } +void ChildPropertyDeriver::Visit(const PhysicalExportExternalFile *) { + // Let child fulfil all the required properties + vector> child_input_properties{requirements_}; + + output_.push_back(make_pair(requirements_, move(child_input_properties))); +} + void ChildPropertyDeriver::DeriveForJoin() { output_.push_back(make_pair( make_shared(), diff --git a/src/optimizer/input_column_deriver.cpp b/src/optimizer/input_column_deriver.cpp index 08d7c54a4ae..019117ae68e 100644 --- a/src/optimizer/input_column_deriver.cpp +++ b/src/optimizer/input_column_deriver.cpp @@ -157,6 +157,10 @@ void InputColumnDeriver::Visit(const PhysicalDelete *) { Passdown(); } void InputColumnDeriver::Visit(const PhysicalUpdate *) { Passdown(); } +void InputColumnDeriver::Visit(const PhysicalExportExternalFile *) { + Passdown(); +} + void InputColumnDeriver::ScanHelper() { // Scan does not have input column, output columns should contain all tuple // value expressions needed diff --git a/src/optimizer/operators.cpp b/src/optimizer/operators.cpp index c9fb133bc90..e168a4d4bea 100644 --- a/src/optimizer/operators.cpp +++ b/src/optimizer/operators.cpp @@ -445,11 +445,29 @@ Operator LogicalLimit::make(int64_t offset, int64_t limit) { //===--------------------------------------------------------------------===// // External file output //===--------------------------------------------------------------------===// -Operator LogicalExportExternalFile::make() { - auto *export_op = new LogicalExternalFileGet(); +Operator LogicalExportExternalFile::make(ExternalFileFormat format, + std::string file_name) { + auto *export_op = new LogicalExportExternalFile(); + export_op->format = format; + export_op->file_name = std::move(file_name); return Operator(export_op); } +bool LogicalExportExternalFile::operator==(const BaseOperatorNode &node) { + if (node.GetType() != OpType::LogicalExportExternalFile) return false; + const auto &export_op = + *static_cast(&node); + return (format == export_op.format && file_name == export_op.file_name); +} + +hash_t LogicalExportExternalFile::Hash() const { + hash_t hash = BaseOperatorNode::Hash(); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&format)); + hash = HashUtil::CombineHashes( + hash, HashUtil::HashBytes(file_name.data(), file_name.length())); + return hash; +} + //===--------------------------------------------------------------------===// // DummyScan //===--------------------------------------------------------------------===// @@ -823,6 +841,32 @@ Operator PhysicalUpdate::make( return Operator(update); } +//===--------------------------------------------------------------------===// +// PhysicalExportExternalFile +//===--------------------------------------------------------------------===// +Operator PhysicalExportExternalFile::make(ExternalFileFormat format, + std::string file_name) { + auto *export_op = new PhysicalExportExternalFile(); + export_op->format = format; + export_op->file_name = file_name; + return Operator(export_op); +} + +bool PhysicalExportExternalFile::operator==(const BaseOperatorNode &node) { + if (node.GetType() != OpType::ExportExternalFile) return false; + const auto &export_op = + *static_cast(&node); + return (format == export_op.format && file_name == export_op.file_name); +} + +hash_t PhysicalExportExternalFile::Hash() const { + hash_t hash = BaseOperatorNode::Hash(); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&format)); + hash = HashUtil::CombineHashes( + hash, HashUtil::HashBytes(file_name.data(), file_name.length())); + return hash; +} + //===--------------------------------------------------------------------===// // PhysicalHashGroupBy //===--------------------------------------------------------------------===// @@ -1005,6 +1049,9 @@ template <> std::string OperatorNode::name_ = "PhysicalDistinct"; template <> std::string OperatorNode::name_ = "PhysicalAggregate"; +template <> +std::string OperatorNode::name_ = + "PhysicalExportExternalFile"; //===--------------------------------------------------------------------===// template <> @@ -1102,7 +1149,11 @@ template <> OpType OperatorNode::type_ = OpType::SortGroupBy; template <> OpType OperatorNode::type_ = OpType::Aggregate; +template <> +OpType OperatorNode::type_ = + OpType::ExportExternalFile; //===--------------------------------------------------------------------===// + template bool OperatorNode::IsLogical() const { return type_ < OpType::LogicalPhysicalDelimiter; diff --git a/src/optimizer/optimizer_task.cpp b/src/optimizer/optimizer_task.cpp index f0a489906ae..8c430f76ae2 100644 --- a/src/optimizer/optimizer_task.cpp +++ b/src/optimizer/optimizer_task.cpp @@ -6,7 +6,7 @@ // // Identification: src/optimizer/optimizer_task.cpp // -// Copyright (c) 2015-17, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -22,6 +22,7 @@ namespace peloton { namespace optimizer { + //===--------------------------------------------------------------------===// // Base class //===--------------------------------------------------------------------===// @@ -30,13 +31,16 @@ void OptimizerTask::ConstructValidRules( std::vector> &rules, std::vector &valid_rules) { for (auto &rule : rules) { - if (group_expr->Op().GetType() != - rule->GetMatchPattern()->Type() || // Root pattern type mismatch - group_expr->HasRuleExplored(rule.get()) || // Rule has been applied + // Check if we can apply the rule + bool root_pattern_mismatch = + group_expr->Op().GetType() != rule->GetMatchPattern()->Type(); + bool already_explored = group_expr->HasRuleExplored(rule.get()); + bool child_pattern_mismatch = group_expr->GetChildrenGroupsSize() != - rule->GetMatchPattern() - ->GetChildPatternsSize()) // Children size does not math + rule->GetMatchPattern()->GetChildPatternsSize(); + if (root_pattern_mismatch || already_explored || child_pattern_mismatch) { continue; + } auto promise = rule->Promise(group_expr, context); if (promise > 0) valid_rules.emplace_back(rule.get(), promise); diff --git a/src/optimizer/plan_generator.cpp b/src/optimizer/plan_generator.cpp index 804184b6246..c2c2dcc399a 100644 --- a/src/optimizer/plan_generator.cpp +++ b/src/optimizer/plan_generator.cpp @@ -23,6 +23,7 @@ #include "planner/aggregate_plan.h" #include "planner/csv_scan_plan.h" #include "planner/delete_plan.h" +#include "planner/export_external_file_plan.h" #include "planner/hash_join_plan.h" #include "planner/hash_plan.h" #include "planner/index_scan_plan.h" @@ -385,6 +386,13 @@ void PlanGenerator::Visit(const PhysicalUpdate *op) { output_plan_ = move(update_plan); } +void PlanGenerator::Visit(const PhysicalExportExternalFile *op) { + unique_ptr export_plan{ + new planner::ExportExternalFilePlan(op->file_name)}; + export_plan->AddChild(move(children_plans_[0])); + output_plan_ = move(export_plan); +} + /************************* Private Functions *******************************/ vector> PlanGenerator::GenerateTableTVExprs( diff --git a/src/optimizer/query_to_operator_transformer.cpp b/src/optimizer/query_to_operator_transformer.cpp index 816ef24a7fb..73c52f9266e 100644 --- a/src/optimizer/query_to_operator_transformer.cpp +++ b/src/optimizer/query_to_operator_transformer.cpp @@ -386,8 +386,8 @@ void QueryToOperatorTransformer::Visit(parser::CopyStatement *op) { } else { op->table->Accept(this); } - auto export_op = - std::make_shared(LogicalExportExternalFile::make()); + auto export_op = std::make_shared( + LogicalExportExternalFile::make(op->format, op->file_path)); export_op->PushChild(output_expr_); output_expr_ = export_op; } diff --git a/src/optimizer/rule.cpp b/src/optimizer/rule.cpp index fc4bc837736..8c72ed17fa8 100644 --- a/src/optimizer/rule.cpp +++ b/src/optimizer/rule.cpp @@ -45,6 +45,7 @@ RuleSet::RuleSet() { AddImplementationRule(new InnerJoinToInnerHashJoin()); AddImplementationRule(new ImplementDistinct()); AddImplementationRule(new ImplementLimit()); + AddImplementationRule(new LogicalExportToPhysicalExport()); AddRewriteRule(RewriteRuleSetName::PREDICATE_PUSH_DOWN, new PushFilterThroughJoin()); diff --git a/src/optimizer/rule_impls.cpp b/src/optimizer/rule_impls.cpp index 284109a38f1..9d0a4624c2a 100644 --- a/src/optimizer/rule_impls.cpp +++ b/src/optimizer/rule_impls.cpp @@ -819,6 +819,37 @@ void ImplementLimit::Transform( transformed.push_back(result_plan); } +/////////////////////////////////////////////////////////////////////////////// +/// LogicalExport to Physical Export +LogicalExportToPhysicalExport::LogicalExportToPhysicalExport() { + type_ = RuleType::EXPORT_EXTERNAL_FILE_TO_PHYSICAL; + match_pattern = std::make_shared(OpType::LogicalExportExternalFile); + match_pattern->AddChild(std::make_shared(OpType::Leaf)); +} + +bool LogicalExportToPhysicalExport::Check( + UNUSED_ATTRIBUTE std::shared_ptr plan, + UNUSED_ATTRIBUTE OptimizeContext *context) const { + return true; +} + +void LogicalExportToPhysicalExport::Transform( + std::shared_ptr input, + std::vector> &transformed, + UNUSED_ATTRIBUTE OptimizeContext *context) const { + const auto *logical_export = input->Op().As(); + + auto result_plan = + std::make_shared(PhysicalExportExternalFile::make( + logical_export->format, logical_export->file_name)); + + std::vector> children = input->Children(); + PELOTON_ASSERT(children.size() == 1); + result_plan->PushChild(children[0]); + + transformed.push_back(result_plan); +} + //===--------------------------------------------------------------------===// // Rewrite rules //===--------------------------------------------------------------------===// diff --git a/src/optimizer/util.cpp b/src/optimizer/util.cpp index 4ff60ee36c8..07685376b34 100644 --- a/src/optimizer/util.cpp +++ b/src/optimizer/util.cpp @@ -15,9 +15,6 @@ #include "catalog/query_metrics_catalog.h" #include "concurrency/transaction_manager_factory.h" #include "expression/expression_util.h" -#include "planner/copy_plan.h" -#include "planner/seq_scan_plan.h" -#include "storage/data_table.h" namespace peloton { namespace optimizer { From f04d036f87e6aad2a79a51c8b5be435de1a7c085 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Wed, 16 May 2018 17:41:58 -0400 Subject: [PATCH 31/42] Extracted implementation into CPP file for plan node --- src/include/planner/csv_scan_plan.h | 85 ++--------------- .../planner/export_external_file_plan.h | 64 ++----------- src/planner/csv_scan_plan.cpp | 93 +++++++++++++++++++ src/planner/export_external_file_plan.cpp | 70 ++++++++++++++ 4 files changed, 179 insertions(+), 133 deletions(-) create mode 100644 src/planner/csv_scan_plan.cpp create mode 100644 src/planner/export_external_file_plan.cpp diff --git a/src/include/planner/csv_scan_plan.h b/src/include/planner/csv_scan_plan.h index 516debcaeeb..2cd255884d3 100644 --- a/src/include/planner/csv_scan_plan.h +++ b/src/include/planner/csv_scan_plan.h @@ -12,15 +12,21 @@ #pragma once +#include #include +#include +#include -#include "codegen/type/type.h" #include "planner/abstract_scan_plan.h" #include "planner/attribute_info.h" +#include "type/type_id.h" namespace peloton { namespace planner { +/** + * This is the plan node when scanning a CSV file. + */ class CSVScanPlan : public AbstractScan { public: struct ColumnInfo { @@ -87,82 +93,5 @@ class CSVScanPlan : public AbstractScan { std::vector attributes_; }; -//////////////////////////////////////////////////////////////////////////////// -/// -/// Implementation below -/// -//////////////////////////////////////////////////////////////////////////////// - -inline CSVScanPlan::CSVScanPlan(std::string file_name, - std::vector &&cols, - char delimiter, char quote, char escape, - std::string null) - : file_name_(std::move(file_name)), - delimiter_(delimiter), - quote_(quote), - escape_(escape), - null_(null) { - attributes_.resize(cols.size()); - for (uint32_t i = 0; i < cols.size(); i++) { - const auto &col_info = cols[i]; - attributes_[i].type = codegen::type::Type{col_info.type, true}; - attributes_[i].attribute_id = i; - attributes_[i].name = col_info.name; - } -} - -inline PlanNodeType CSVScanPlan::GetPlanNodeType() const { - return PlanNodeType::CSVSCAN; -} - -inline std::unique_ptr CSVScanPlan::Copy() const { - std::vector new_cols; - for (const auto &attribute : attributes_) { - new_cols.push_back(CSVScanPlan::ColumnInfo{.name = attribute.name, - .type = attribute.type.type_id}); - } - return std::unique_ptr( - new CSVScanPlan(file_name_, std::move(new_cols))); -} - -inline void CSVScanPlan::PerformBinding(BindingContext &binding_context) { - for (uint32_t i = 0; i < attributes_.size(); i++) { - binding_context.BindNew(i, &attributes_[i]); - } -} - -inline void CSVScanPlan::GetOutputColumns(std::vector &columns) const { - columns.clear(); - columns.resize(attributes_.size()); - std::iota(columns.begin(), columns.end(), 0); -} - -inline hash_t CSVScanPlan::Hash() const { - hash_t hash = HashUtil::HashBytes(file_name_.data(), file_name_.length()); - hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&delimiter_)); - hash = HashUtil::CombineHashes(hash, HashUtil::Hash("e_)); - hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&escape_)); - hash = HashUtil::CombineHashes( - hash, HashUtil::HashBytes(null_.c_str(), null_.length())); - return hash; -} - -inline bool CSVScanPlan::operator==(const AbstractPlan &rhs) const { - if (rhs.GetPlanNodeType() != PlanNodeType::CSVSCAN) return false; - const auto &other = static_cast(rhs); - return ( - (StringUtil::Upper(file_name_) == StringUtil::Upper(other.file_name_)) && - delimiter_ == other.delimiter_ && quote_ == other.quote_ && - escape_ == other.escape_); -} - -inline void CSVScanPlan::GetAttributes( - std::vector &ais) const { - ais.clear(); - for (const auto &ai : attributes_) { - ais.push_back(&ai); - } -} - } // namespace planner } // namespace peloton \ No newline at end of file diff --git a/src/include/planner/export_external_file_plan.h b/src/include/planner/export_external_file_plan.h index 6962891a19d..7dfb5807422 100644 --- a/src/include/planner/export_external_file_plan.h +++ b/src/include/planner/export_external_file_plan.h @@ -12,12 +12,20 @@ #pragma once -#include "concurrency/transaction_context.h" +#include +#include +#include + #include "planner/abstract_plan.h" namespace peloton { namespace planner { +/** + * This is the plan node when exporting data from the database into an external + * file. It is configured with the name of the file to write content into, and + * the delimiter, quote, and escape characters to use when writing content. + */ class ExportExternalFilePlan : public AbstractPlan { public: ExportExternalFilePlan(std::string file_name, char delimiter = ',', @@ -61,59 +69,5 @@ class ExportExternalFilePlan : public AbstractPlan { char escape_; }; -//////////////////////////////////////////////////////////////////////////////// -/// -/// Implementation below -/// -//////////////////////////////////////////////////////////////////////////////// - -inline ExportExternalFilePlan::ExportExternalFilePlan(std::string file_name, - char delimiter, - char quote, char escape) - : file_name_(file_name), - delimiter_(delimiter), - quote_(quote), - escape_(escape) {} - -inline PlanNodeType ExportExternalFilePlan::GetPlanNodeType() const { - return PlanNodeType::EXPORT_EXTERNAL_FILE; -} - -inline hash_t ExportExternalFilePlan::Hash() const { - hash_t hash = HashUtil::HashBytes(file_name_.data(), file_name_.length()); - hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&delimiter_)); - hash = HashUtil::CombineHashes(hash, HashUtil::Hash("e_)); - hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&escape_)); - return hash; -} - -inline bool ExportExternalFilePlan::operator==(const AbstractPlan &rhs) const { - if (rhs.GetPlanNodeType() != PlanNodeType::EXPORT_EXTERNAL_FILE) return false; - const auto &other = static_cast(rhs); - return ( - (StringUtil::Upper(file_name_) == StringUtil::Upper(other.file_name_)) && - delimiter_ == other.delimiter_ && quote_ == other.quote_ && - escape_ == other.escape_); -} - -inline std::unique_ptr ExportExternalFilePlan::Copy() const { - return std::unique_ptr{ - new ExportExternalFilePlan(file_name_, delimiter_, quote_, escape_)}; -} - -inline void ExportExternalFilePlan::PerformBinding( - BindingContext &binding_context) { - PELOTON_ASSERT(GetChildrenSize() == 1); - auto &child = *GetChild(0); - - std::vector child_output_cols; - child.GetOutputColumns(child_output_cols); - - output_attributes_.clear(); - for (const auto &col_id : child_output_cols) { - output_attributes_.push_back(binding_context.Find(col_id)); - } -} - } // namespace planner } // namespace peloton \ No newline at end of file diff --git a/src/planner/csv_scan_plan.cpp b/src/planner/csv_scan_plan.cpp new file mode 100644 index 00000000000..c4ff66765e9 --- /dev/null +++ b/src/planner/csv_scan_plan.cpp @@ -0,0 +1,93 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// csv_scan_plan.cpp +// +// Identification: src/planner/csv_scan_plan.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "planner/csv_scan_plan.h" + +#include + +#include "codegen/type/type.h" + +namespace peloton { +namespace planner { + +CSVScanPlan::CSVScanPlan(std::string file_name, + std::vector &&cols, + char delimiter, char quote, char escape, + std::string null) + : file_name_(std::move(file_name)), + delimiter_(delimiter), + quote_(quote), + escape_(escape), + null_(null) { + attributes_.resize(cols.size()); + for (uint32_t i = 0; i < cols.size(); i++) { + const auto &col_info = cols[i]; + attributes_[i].type = codegen::type::Type{col_info.type, true}; + attributes_[i].attribute_id = i; + attributes_[i].name = col_info.name; + } +} + +PlanNodeType CSVScanPlan::GetPlanNodeType() const { + return PlanNodeType::CSVSCAN; +} + +std::unique_ptr CSVScanPlan::Copy() const { + std::vector new_cols; + for (const auto &attribute : attributes_) { + new_cols.push_back(CSVScanPlan::ColumnInfo{.name = attribute.name, + .type = attribute.type.type_id}); + } + return std::unique_ptr( + new CSVScanPlan(file_name_, std::move(new_cols))); +} + +void CSVScanPlan::PerformBinding(BindingContext &binding_context) { + for (uint32_t i = 0; i < attributes_.size(); i++) { + binding_context.BindNew(i, &attributes_[i]); + } +} + +void CSVScanPlan::GetOutputColumns(std::vector &columns) const { + columns.clear(); + columns.resize(attributes_.size()); + std::iota(columns.begin(), columns.end(), 0); +} + +hash_t CSVScanPlan::Hash() const { + hash_t hash = HashUtil::HashBytes(file_name_.data(), file_name_.length()); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&delimiter_)); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash("e_)); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&escape_)); + hash = HashUtil::CombineHashes( + hash, HashUtil::HashBytes(null_.c_str(), null_.length())); + return hash; +} + +bool CSVScanPlan::operator==(const AbstractPlan &rhs) const { + if (rhs.GetPlanNodeType() != PlanNodeType::CSVSCAN) return false; + const auto &other = static_cast(rhs); + return ( + (StringUtil::Upper(file_name_) == StringUtil::Upper(other.file_name_)) && + delimiter_ == other.delimiter_ && quote_ == other.quote_ && + escape_ == other.escape_); +} + +void CSVScanPlan::GetAttributes(std::vector &ais) const { + ais.clear(); + for (const auto &ai : attributes_) { + ais.push_back(&ai); + } +} + +} // namespace planner +} // namespace peloton \ No newline at end of file diff --git a/src/planner/export_external_file_plan.cpp b/src/planner/export_external_file_plan.cpp new file mode 100644 index 00000000000..8f63cc1a072 --- /dev/null +++ b/src/planner/export_external_file_plan.cpp @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// export_external_file_plan.cpp +// +// Identification: src/planner/export_external_file_plan.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "planner/export_external_file_plan.h" + +#include "common/macros.h" +#include "util/hash_util.h" +#include "util/string_util.h" + +namespace peloton { +namespace planner { + +ExportExternalFilePlan::ExportExternalFilePlan(std::string file_name, + char delimiter, char quote, + char escape) + : file_name_(file_name), + delimiter_(delimiter), + quote_(quote), + escape_(escape) {} + +PlanNodeType ExportExternalFilePlan::GetPlanNodeType() const { + return PlanNodeType::EXPORT_EXTERNAL_FILE; +} + +hash_t ExportExternalFilePlan::Hash() const { + hash_t hash = HashUtil::HashBytes(file_name_.data(), file_name_.length()); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&delimiter_)); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash("e_)); + hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&escape_)); + return hash; +} + +bool ExportExternalFilePlan::operator==(const AbstractPlan &rhs) const { + if (rhs.GetPlanNodeType() != PlanNodeType::EXPORT_EXTERNAL_FILE) return false; + const auto &other = static_cast(rhs); + return ( + (StringUtil::Upper(file_name_) == StringUtil::Upper(other.file_name_)) && + delimiter_ == other.delimiter_ && quote_ == other.quote_ && + escape_ == other.escape_); +} + +std::unique_ptr ExportExternalFilePlan::Copy() const { + return std::unique_ptr{ + new ExportExternalFilePlan(file_name_, delimiter_, quote_, escape_)}; +} + +void ExportExternalFilePlan::PerformBinding(BindingContext &binding_context) { + PELOTON_ASSERT(GetChildrenSize() == 1); + auto &child = *GetChild(0); + + std::vector child_output_cols; + child.GetOutputColumns(child_output_cols); + + output_attributes_.clear(); + for (const auto &col_id : child_output_cols) { + output_attributes_.push_back(binding_context.Find(col_id)); + } +} + +} // namespace planner +} // namespace peloton \ No newline at end of file From 0483a6de8385060cd2f314ddf08f0c350278faa0 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Wed, 16 May 2018 17:51:12 -0400 Subject: [PATCH 32/42] * Propagatge file options through optimization. * Added codegen.cpp to source validator whitelist, since we have the ability to call printf() from codegen for debug. * Beefed up overflow checks in NumericRuntime. * Fixed tests. --- script/validators/source_validator.py | 4 +- src/function/numeric_functions.cpp | 34 ++++++++++--- src/include/optimizer/operators.h | 24 +++++++-- src/include/parser/copy_statement.h | 6 ++- src/optimizer/operators.cpp | 50 ++++++++++++++++--- src/optimizer/plan_generator.cpp | 6 ++- .../query_to_operator_transformer.cpp | 8 +-- src/optimizer/rule_impls.cpp | 8 +-- src/parser/postgresparser.cpp | 16 +++++- test/codegen/csv_scan_test.cpp | 49 +++++++++++++++--- test/codegen/value_integrity_test.cpp | 7 ++- test/common/internal_types_test.cpp | 4 +- ...ns_test.cpp => numeric_functions_test.cpp} | 25 +++++----- 13 files changed, 185 insertions(+), 56 deletions(-) rename test/function/{decimal_functions_test.cpp => numeric_functions_test.cpp} (93%) diff --git a/script/validators/source_validator.py b/script/validators/source_validator.py index 331bdc7c688..ad70c24c7e5 100755 --- a/script/validators/source_validator.py +++ b/script/validators/source_validator.py @@ -58,12 +58,12 @@ "src/network/protocol.cpp", "src/include/common/macros.h", "src/common/stack_trace.cpp", - "src/include/parser/sql_scanner.h", # There is a free() in comments "src/include/index/bloom_filter.h", "src/include/index/compact_ints_key.h", "src/include/index/bwtree.h", "src/codegen/util/oa_hash_table.cpp", - "src/codegen/util/cc_hash_table.cpp" + "src/codegen/util/cc_hash_table.cpp", + "src/codegen/codegen.cpp", # We allow calling printf() from codegen for debugging ] ## ============================================== diff --git a/src/function/numeric_functions.cpp b/src/function/numeric_functions.cpp index f4a943c8ce0..f0d13e92ffc 100644 --- a/src/function/numeric_functions.cpp +++ b/src/function/numeric_functions.cpp @@ -216,13 +216,25 @@ T ParseInteger(const char *ptr, uint32_t len) { } // Convert - int64_t num = 0; + uint64_t cutoff = + static_cast(negative ? -std::numeric_limits::min() + : std::numeric_limits::max()); + uint64_t cutlimit = cutoff % 10; + cutoff /= 10; + + uint64_t num = 0; while (start < end) { if (*start < '0' || *start > '9') { break; } - num = (num * 10) + (*start - '0'); + uint32_t c = static_cast(*start - '0'); + + if (num > cutoff || (num == cutoff && c > cutlimit)) { + goto overflow; + } + + num = (num * 10) + c; start++; } @@ -234,8 +246,7 @@ T ParseInteger(const char *ptr, uint32_t len) { // If we haven't consumed everything at this point, it was an invalid input if (start < end) { - codegen::RuntimeFunctions::ThrowInvalidInputStringException(); - __builtin_unreachable(); + goto invalid; } // Negate number if we need to @@ -244,14 +255,21 @@ T ParseInteger(const char *ptr, uint32_t len) { } // Range check - if (num <= std::numeric_limits::min() || - num >= std::numeric_limits::max()) { - codegen::RuntimeFunctions::ThrowOverflowException(); - __builtin_unreachable(); + if (static_cast(num) <= std::numeric_limits::min() || + static_cast(num) >= std::numeric_limits::max()) { + goto overflow; } // Done return static_cast(num); + +overflow: + codegen::RuntimeFunctions::ThrowOverflowException(); + __builtin_unreachable(); + +invalid: + codegen::RuntimeFunctions::ThrowInvalidInputStringException(); + __builtin_unreachable(); } } // namespace diff --git a/src/include/optimizer/operators.h b/src/include/optimizer/operators.h index 8a7c7582e56..d51d66b01e8 100644 --- a/src/include/optimizer/operators.h +++ b/src/include/optimizer/operators.h @@ -73,7 +73,8 @@ class LogicalGet : public OperatorNode { class LogicalExternalFileGet : public OperatorNode { public: static Operator make(oid_t get_id, ExternalFileFormat format, - std::string file_name); + std::string file_name, char delimiter, char quote, + char escape); bool operator==(const BaseOperatorNode &r) override; @@ -83,6 +84,9 @@ class LogicalExternalFileGet : public OperatorNode { oid_t get_id; ExternalFileFormat format; std::string file_name; + char delimiter; + char quote; + char escape; }; //===--------------------------------------------------------------------===// @@ -330,7 +334,8 @@ class LogicalUpdate : public OperatorNode { class LogicalExportExternalFile : public OperatorNode { public: - static Operator make(ExternalFileFormat format, std::string file_name); + static Operator make(ExternalFileFormat format, std::string file_name, + char delimiter, char quote, char escape); bool operator==(const BaseOperatorNode &r) override; @@ -338,6 +343,9 @@ class LogicalExportExternalFile ExternalFileFormat format; std::string file_name; + char delimiter; + char quote; + char escape; }; //===--------------------------------------------------------------------===// @@ -410,7 +418,8 @@ class PhysicalIndexScan : public OperatorNode { class ExternalFileScan : public OperatorNode { public: static Operator make(oid_t get_id, ExternalFileFormat format, - std::string file_name); + std::string file_name, char delimiter, char quote, + char escape); bool operator==(const BaseOperatorNode &r) override; @@ -420,6 +429,9 @@ class ExternalFileScan : public OperatorNode { oid_t get_id; ExternalFileFormat format; std::string file_name; + char delimiter; + char quote; + char escape; }; //===--------------------------------------------------------------------===// @@ -617,7 +629,8 @@ class PhysicalUpdate : public OperatorNode { class PhysicalExportExternalFile : public OperatorNode { public: - static Operator make(ExternalFileFormat format, std::string file_name); + static Operator make(ExternalFileFormat format, std::string file_name, + char delimiter, char quote, char escape); bool operator==(const BaseOperatorNode &r) override; @@ -625,6 +638,9 @@ class PhysicalExportExternalFile ExternalFileFormat format; std::string file_name; + char delimiter; + char quote; + char escape; }; //===--------------------------------------------------------------------===// diff --git a/src/include/parser/copy_statement.h b/src/include/parser/copy_statement.h index 8145cd695e9..67e8fe5ee25 100644 --- a/src/include/parser/copy_statement.h +++ b/src/include/parser/copy_statement.h @@ -62,11 +62,13 @@ class CopyStatement : public SQLStatement { std::string file_path; // The format of the file - ExternalFileFormat format; + ExternalFileFormat format = ExternalFileFormat::CSV; bool is_from; - char delimiter; + char delimiter = ','; + char quote = '"'; + char escape = '"'; }; } // namespace parser diff --git a/src/optimizer/operators.cpp b/src/optimizer/operators.cpp index e168a4d4bea..6457e769db2 100644 --- a/src/optimizer/operators.cpp +++ b/src/optimizer/operators.cpp @@ -68,11 +68,15 @@ bool LogicalGet::operator==(const BaseOperatorNode &r) { //===--------------------------------------------------------------------===// Operator LogicalExternalFileGet::make(oid_t get_id, ExternalFileFormat format, - std::string file_name) { + std::string file_name, char delimiter, + char quote, char escape) { auto *get = new LogicalExternalFileGet(); get->get_id = get_id; get->format = format; get->file_name = std::move(file_name); + get->delimiter = delimiter; + get->quote = quote; + get->escape = escape; return Operator(get); } @@ -80,7 +84,8 @@ bool LogicalExternalFileGet::operator==(const BaseOperatorNode &node) { if (node.GetType() != OpType::LogicalExternalFileGet) return false; const auto &get = *static_cast(&node); return (get_id == get.get_id && format == get.format && - file_name == get.file_name); + file_name == get.file_name && delimiter == get.delimiter && + quote == get.quote && escape == get.escape); } hash_t LogicalExternalFileGet::Hash() const { @@ -89,6 +94,9 @@ hash_t LogicalExternalFileGet::Hash() const { hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&format)); hash = HashUtil::CombineHashes( hash, HashUtil::HashBytes(file_name.data(), file_name.length())); + hash = HashUtil::CombineHashes(hash, HashUtil::HashBytes(&delimiter, 1)); + hash = HashUtil::CombineHashes(hash, HashUtil::HashBytes("e, 1)); + hash = HashUtil::CombineHashes(hash, HashUtil::HashBytes(&escape, 1)); return hash; } @@ -446,10 +454,14 @@ Operator LogicalLimit::make(int64_t offset, int64_t limit) { // External file output //===--------------------------------------------------------------------===// Operator LogicalExportExternalFile::make(ExternalFileFormat format, - std::string file_name) { + std::string file_name, char delimiter, + char quote, char escape) { auto *export_op = new LogicalExportExternalFile(); export_op->format = format; export_op->file_name = std::move(file_name); + export_op->delimiter = delimiter; + export_op->quote = quote; + export_op->escape = escape; return Operator(export_op); } @@ -457,7 +469,9 @@ bool LogicalExportExternalFile::operator==(const BaseOperatorNode &node) { if (node.GetType() != OpType::LogicalExportExternalFile) return false; const auto &export_op = *static_cast(&node); - return (format == export_op.format && file_name == export_op.file_name); + return (format == export_op.format && file_name == export_op.file_name && + delimiter == export_op.delimiter && quote == export_op.quote && + escape == export_op.escape); } hash_t LogicalExportExternalFile::Hash() const { @@ -465,6 +479,9 @@ hash_t LogicalExportExternalFile::Hash() const { hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&format)); hash = HashUtil::CombineHashes( hash, HashUtil::HashBytes(file_name.data(), file_name.length())); + hash = HashUtil::CombineHashes(hash, HashUtil::HashBytes(&delimiter, 1)); + hash = HashUtil::CombineHashes(hash, HashUtil::HashBytes("e, 1)); + hash = HashUtil::CombineHashes(hash, HashUtil::HashBytes(&escape, 1)); return hash; } @@ -567,11 +584,15 @@ hash_t PhysicalIndexScan::Hash() const { // Physical external file scan //===--------------------------------------------------------------------===// Operator ExternalFileScan::make(oid_t get_id, ExternalFileFormat format, - std::string file_name) { + std::string file_name, char delimiter, + char quote, char escape) { auto *get = new ExternalFileScan(); get->get_id = get_id; get->format = format; get->file_name = file_name; + get->delimiter = delimiter; + get->quote = quote; + get->escape = escape; return Operator(get); } @@ -579,7 +600,8 @@ bool ExternalFileScan::operator==(const BaseOperatorNode &node) { if (node.GetType() != OpType::QueryDerivedScan) return false; const auto &get = *static_cast(&node); return (get_id == get.get_id && format == get.format && - file_name == get.file_name); + file_name == get.file_name && delimiter == get.delimiter && + quote == get.quote && escape == get.escape); } hash_t ExternalFileScan::Hash() const { @@ -588,6 +610,9 @@ hash_t ExternalFileScan::Hash() const { hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&format)); hash = HashUtil::CombineHashes( hash, HashUtil::HashBytes(file_name.data(), file_name.length())); + hash = HashUtil::CombineHashes(hash, HashUtil::HashBytes(&delimiter, 1)); + hash = HashUtil::CombineHashes(hash, HashUtil::HashBytes("e, 1)); + hash = HashUtil::CombineHashes(hash, HashUtil::HashBytes(&escape, 1)); return hash; } @@ -845,10 +870,14 @@ Operator PhysicalUpdate::make( // PhysicalExportExternalFile //===--------------------------------------------------------------------===// Operator PhysicalExportExternalFile::make(ExternalFileFormat format, - std::string file_name) { + std::string file_name, char delimiter, + char quote, char escape) { auto *export_op = new PhysicalExportExternalFile(); export_op->format = format; export_op->file_name = file_name; + export_op->delimiter = delimiter; + export_op->quote = quote; + export_op->escape = escape; return Operator(export_op); } @@ -856,7 +885,9 @@ bool PhysicalExportExternalFile::operator==(const BaseOperatorNode &node) { if (node.GetType() != OpType::ExportExternalFile) return false; const auto &export_op = *static_cast(&node); - return (format == export_op.format && file_name == export_op.file_name); + return (format == export_op.format && file_name == export_op.file_name && + delimiter == export_op.delimiter && quote == export_op.quote && + escape == export_op.escape); } hash_t PhysicalExportExternalFile::Hash() const { @@ -864,6 +895,9 @@ hash_t PhysicalExportExternalFile::Hash() const { hash = HashUtil::CombineHashes(hash, HashUtil::Hash(&format)); hash = HashUtil::CombineHashes( hash, HashUtil::HashBytes(file_name.data(), file_name.length())); + hash = HashUtil::CombineHashes(hash, HashUtil::HashBytes(&delimiter, 1)); + hash = HashUtil::CombineHashes(hash, HashUtil::HashBytes("e, 1)); + hash = HashUtil::CombineHashes(hash, HashUtil::HashBytes(&escape, 1)); return hash; } diff --git a/src/optimizer/plan_generator.cpp b/src/optimizer/plan_generator.cpp index c2c2dcc399a..671ef94dabd 100644 --- a/src/optimizer/plan_generator.cpp +++ b/src/optimizer/plan_generator.cpp @@ -143,7 +143,8 @@ void PlanGenerator::Visit(const ExternalFileScan *op) { // Create the plan output_plan_.reset( - new planner::CSVScanPlan(op->file_name, std::move(cols))); + new planner::CSVScanPlan(op->file_name, std::move(cols), + op->delimiter, op->quote, op->escape)); break; } } @@ -388,7 +389,8 @@ void PlanGenerator::Visit(const PhysicalUpdate *op) { void PlanGenerator::Visit(const PhysicalExportExternalFile *op) { unique_ptr export_plan{ - new planner::ExportExternalFilePlan(op->file_name)}; + new planner::ExportExternalFilePlan(op->file_name, op->delimiter, + op->quote, op->escape)}; export_plan->AddChild(move(children_plans_[0])); output_plan_ = move(export_plan); } diff --git a/src/optimizer/query_to_operator_transformer.cpp b/src/optimizer/query_to_operator_transformer.cpp index 73c52f9266e..56925c3b117 100644 --- a/src/optimizer/query_to_operator_transformer.cpp +++ b/src/optimizer/query_to_operator_transformer.cpp @@ -367,7 +367,8 @@ void QueryToOperatorTransformer::Visit(parser::CopyStatement *op) { auto get_op = std::make_shared(LogicalExternalFileGet::make( - GetAndIncreaseGetId(), op->format, op->file_path)); + GetAndIncreaseGetId(), op->format, op->file_path, op->delimiter, + op->quote, op->escape)); auto target_table = catalog::Catalog::GetInstance() @@ -386,8 +387,9 @@ void QueryToOperatorTransformer::Visit(parser::CopyStatement *op) { } else { op->table->Accept(this); } - auto export_op = std::make_shared( - LogicalExportExternalFile::make(op->format, op->file_path)); + auto export_op = + std::make_shared(LogicalExportExternalFile::make( + op->format, op->file_path, op->delimiter, op->quote, op->escape)); export_op->PushChild(output_expr_); output_expr_ = export_op; } diff --git a/src/optimizer/rule_impls.cpp b/src/optimizer/rule_impls.cpp index 9d0a4624c2a..33fb241df8d 100644 --- a/src/optimizer/rule_impls.cpp +++ b/src/optimizer/rule_impls.cpp @@ -455,7 +455,8 @@ void LogicalExternalFileGetToPhysical::Transform( const auto *get = input->Op().As(); auto result_plan = std::make_shared( - ExternalFileScan::make(get->get_id, get->format, get->file_name)); + ExternalFileScan::make(get->get_id, get->format, get->file_name, + get->delimiter, get->quote, get->escape)); PELOTON_ASSERT(input->Children().empty()); @@ -837,11 +838,12 @@ void LogicalExportToPhysicalExport::Transform( std::shared_ptr input, std::vector> &transformed, UNUSED_ATTRIBUTE OptimizeContext *context) const { - const auto *logical_export = input->Op().As(); + const auto *export_op = input->Op().As(); auto result_plan = std::make_shared(PhysicalExportExternalFile::make( - logical_export->format, logical_export->file_name)); + export_op->format, export_op->file_name, export_op->delimiter, + export_op->quote, export_op->escape)); std::vector> children = input->Children(); PELOTON_ASSERT(children.size() == 1); diff --git a/src/parser/postgresparser.cpp b/src/parser/postgresparser.cpp index ffbea10e39d..069285fc1a4 100644 --- a/src/parser/postgresparser.cpp +++ b/src/parser/postgresparser.cpp @@ -1508,6 +1508,8 @@ parser::PrepareStatement *PostgresParser::PrepareTransform(PrepareStmt *root) { parser::CopyStatement *PostgresParser::CopyTransform(CopyStmt *root) { static constexpr char kDelimiterTok[] = "delimiter"; static constexpr char kFormatTok[] = "format"; + static constexpr char kQuoteTok[] = "quote"; + static constexpr char kEscapeTok[] = "escape"; // The main return value auto *result = new CopyStatement(); @@ -1538,12 +1540,24 @@ parser::CopyStatement *PostgresParser::CopyTransform(CopyStmt *root) { auto *format_val = reinterpret_cast(def_elem->arg); result->format = StringToExternalFileFormat(format_val->val.str); } + + // Check quote + if (strncmp(def_elem->defname, kQuoteTok, sizeof(kQuoteTok)) == 0) { + auto *quote_val = reinterpret_cast(def_elem->arg); + result->quote = *quote_val->val.str; + } + + // Check escape + if (strncmp(def_elem->defname, kEscapeTok, sizeof(kEscapeTok)) == 0) { + auto *escape_val = reinterpret_cast(def_elem->arg); + result->escape = *escape_val->val.str; + } } return result; } -// Analyze statment is parsed with vacuum statment. +// Analyze statment is parsed with vacuum statement. parser::AnalyzeStatement *PostgresParser::VacuumTransform(VacuumStmt *root) { if (root->options != VACOPT_ANALYZE) { throw NotImplementedException("Vacuum not supported."); diff --git a/test/codegen/csv_scan_test.cpp b/test/codegen/csv_scan_test.cpp index f40fc823e80..89da65be90d 100644 --- a/test/codegen/csv_scan_test.cpp +++ b/test/codegen/csv_scan_test.cpp @@ -14,6 +14,9 @@ #include "codegen/util/csv_scanner.h" #include "common/timer.h" +#include "function/date_functions.h" +#include "function/numeric_functions.h" +#include "function/string_functions.h" #include "util/file_util.h" namespace peloton { @@ -92,25 +95,59 @@ TEST_F(CSVScanTest, SimpleNumericScan) { TEST_F(CSVScanTest, MixedStringScan) { // Create a temporary CSV file - std::vector rows = {"1,2,3,test", "4,5,6,\"test\"", - "8,9,10,\"test\nnewline\ninquote\""}; + std::vector rows = { + "1,1994-01-01,3,test", "4,2018-01-01,6,\"test\"", + "8,2016-05-05,10,\"test\nnewline\ninquote\""}; std::vector types = {{type::TypeId::INTEGER, false}, - {type::TypeId::INTEGER, false}, + {type::TypeId::DATE, false}, {type::TypeId::INTEGER, false}, {type::TypeId::VARCHAR, false}}; - uint32_t rows_read = 0; + std::vector rows_read; IterateAsCSV(rows, types, [&rows_read, &types]( const codegen::util::CSVScanner::Column *cols) { - rows_read++; + std::string row; for (uint32_t i = 0; i < types.size(); i++) { EXPECT_FALSE(cols[i].is_null); EXPECT_GT(cols[i].len, 0); + if (i > 0) row.append(","); + switch (types[i].type_id) { + case type::TypeId::INTEGER: { + row.append(std::to_string(function::NumericFunctions::InputInteger( + types[i], cols[i].ptr, cols[i].len))); + break; + } + case type::TypeId::DATE: { + auto raw_date = function::DateFunctions::InputDate( + types[i], cols[i].ptr, cols[i].len); + int32_t year, month, day; + function::DateFunctions::JulianToDate(raw_date, year, month, day); + row.append(StringUtil::Format("%u-%02u-%02u", year, month, day)); + break; + } + case type::TypeId::VARCHAR: { + auto ret = function::StringFunctions::InputString( + types[i], cols[i].ptr, cols[i].len); + row.append(std::string{ret.str, ret.length - 1}); + break; + } + default: { + throw Exception{StringUtil::Format( + "Did not expect column type '%s' in test. Did you forget to " + "modify the switch statement to handle a column type you've added" + "in the test case?", + TypeIdToString(types[i].type_id).c_str())}; + } + } } + rows_read.push_back(row); }); // Check - EXPECT_EQ(rows.size(), rows_read); + ASSERT_EQ(rows.size(), rows_read.size()); + for (uint32_t i = 0; i < rows.size(); i++) { + EXPECT_EQ(rows[i], rows_read[i]); + } } } // namespace test diff --git a/test/codegen/value_integrity_test.cpp b/test/codegen/value_integrity_test.cpp index 87450683afc..0057721352b 100644 --- a/test/codegen/value_integrity_test.cpp +++ b/test/codegen/value_integrity_test.cpp @@ -12,6 +12,8 @@ #include "codegen/testing_codegen_util.h" +#include + #include "codegen/function_builder.h" #include "codegen/type/tinyint_type.h" #include "codegen/type/smallint_type.h" @@ -198,8 +200,9 @@ void TestInputIntegral( // Default overflow tests std::vector overflow_tests = { - std::to_string(static_cast(std::numeric_limits::min()) - 1), - std::to_string(static_cast(std::numeric_limits::max()) + 1)}; + std::to_string(std::numeric_limits::min()) + "1", + std::to_string(std::numeric_limits::max()) + "1", + "123456789123456789123456789"}; overflow_tests.insert(overflow_tests.end(), extra_overflow_tests.begin(), extra_overflow_tests.end()); diff --git a/test/common/internal_types_test.cpp b/test/common/internal_types_test.cpp index c9782514fc6..7a616315e20 100644 --- a/test/common/internal_types_test.cpp +++ b/test/common/internal_types_test.cpp @@ -325,8 +325,8 @@ TEST_F(InternalTypesTests, PlanNodeTypeTest) { PlanNodeType::ORDERBY, PlanNodeType::PROJECTION, PlanNodeType::MATERIALIZE, PlanNodeType::LIMIT, PlanNodeType::DISTINCT, PlanNodeType::SETOP, PlanNodeType::APPEND, PlanNodeType::AGGREGATE_V2, - PlanNodeType::HASH, PlanNodeType::RESULT, PlanNodeType::COPY, - PlanNodeType::MOCK}; + PlanNodeType::HASH, PlanNodeType::RESULT, + PlanNodeType::EXPORT_EXTERNAL_FILE, PlanNodeType::MOCK}; // Make sure that ToString and FromString work for (auto val : list) { diff --git a/test/function/decimal_functions_test.cpp b/test/function/numeric_functions_test.cpp similarity index 93% rename from test/function/decimal_functions_test.cpp rename to test/function/numeric_functions_test.cpp index 1ef4f7cd87c..35622209fde 100644 --- a/test/function/decimal_functions_test.cpp +++ b/test/function/numeric_functions_test.cpp @@ -2,18 +2,18 @@ // // Peloton // -// decimal_functions_test.cpp +// numeric_functions_test.cpp // // Identification: test/expression/decimal_functions_test.cpp // -// Copyright (c) 2015-17, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// -#include +#include +#include #include #include -#include #include "common/harness.h" @@ -21,7 +21,6 @@ #include "common/internal_types.h" #include "type/value.h" #include "type/value_factory.h" -#include "util/string_util.h" using ::testing::NotNull; using ::testing::Return; @@ -29,9 +28,9 @@ using ::testing::Return; namespace peloton { namespace test { -class DecimalFunctionsTests : public PelotonTest {}; +class NumericFunctionsTest : public PelotonTest {}; -TEST_F(DecimalFunctionsTests, SqrtTest) { +TEST_F(NumericFunctionsTest, SqrtTest) { const double column_val = 9.0; const double expected = sqrt(9.0); std::vector args = { @@ -47,7 +46,7 @@ TEST_F(DecimalFunctionsTests, SqrtTest) { EXPECT_TRUE(result.IsNull()); } -TEST_F(DecimalFunctionsTests, FloorTest) { +TEST_F(NumericFunctionsTest, FloorTest) { // Testing Floor with DecimalTypes std::vector inputs = {9.5, 3.3, -4.4, 0.0}; std::vector args; @@ -89,7 +88,7 @@ TEST_F(DecimalFunctionsTests, FloorTest) { EXPECT_TRUE(result.IsNull()); } -TEST_F(DecimalFunctionsTests, RoundTest) { +TEST_F(NumericFunctionsTest, RoundTest) { std::vector column_vals = {9.5, 3.3, -4.4, -5.5, 0.0}; std::vector args; for (double val : column_vals) { @@ -105,7 +104,7 @@ TEST_F(DecimalFunctionsTests, RoundTest) { EXPECT_TRUE(result.IsNull()); } -TEST_F(DecimalFunctionsTests,AbsTestDouble) { +TEST_F(NumericFunctionsTest,AbsTestDouble) { std::vector doubleTestInputs = {9.5, -2.5, -4.4, 0.0}; std::vector args; for (double in : doubleTestInputs) { @@ -121,7 +120,7 @@ TEST_F(DecimalFunctionsTests,AbsTestDouble) { EXPECT_TRUE(result.IsNull()); } -TEST_F(DecimalFunctionsTests, AbsTestInt) { +TEST_F(NumericFunctionsTest, AbsTestInt) { std::vector bigIntTestInputs = {-20, -15, -10, 0, 10, 20}; std::vector intTestInputs = {-20, -15, -10, 0, 10, 20}; std::vector smallIntTestInputs = {-20, -15, -10, 0, 10, 20}; @@ -158,7 +157,7 @@ TEST_F(DecimalFunctionsTests, AbsTestInt) { } } -TEST_F(DecimalFunctionsTests, CeilTestDouble) { +TEST_F(NumericFunctionsTest, CeilTestDouble) { std::vector doubleTestInputs = {-36.0, -35.222, -0.7, -0.5, -0.2, 0.0, 0.2, 0.5, 0.7, 35.2, 36.0, 37.2222}; @@ -175,7 +174,7 @@ TEST_F(DecimalFunctionsTests, CeilTestDouble) { EXPECT_TRUE(result.IsNull()); } -TEST_F(DecimalFunctionsTests, CeilTestInt) { +TEST_F(NumericFunctionsTest, CeilTestInt) { std::vector bigIntTestInputs = {-20, -15, -10, 0, 10, 20}; std::vector intTestInputs = {-20, -15, -10, 0, 10, 20}; std::vector smallIntTestInputs = {-20, -15, -10, 0, 10, 20}; From 342c4caa69d41e37ba372df16f25d119c2e70505 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Tue, 22 May 2018 11:24:38 -0400 Subject: [PATCH 33/42] Fixes after rebase --- src/codegen/codegen.cpp | 2 +- src/codegen/operator/csv_scan_translator.cpp | 103 +++++++++++++----- src/codegen/proxy/csv_scanner_proxy.cpp | 7 +- .../codegen/operator/csv_scan_translator.h | 12 +- 4 files changed, 85 insertions(+), 39 deletions(-) diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index 0f8b426b61c..b35838d16e1 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -195,7 +195,7 @@ llvm::Value *CodeGen::Memcmp(llvm::Value *ptr1, llvm::Value *ptr2, memcmp_fn = RegisterBuiltin( kMemcmpFnName, llvm::TypeBuilder::get(GetContext()), - reinterpret_cast(printf)); + reinterpret_cast(memcmp)); #if GCC_AT_LEAST_6 #pragma GCC diagnostic pop #endif diff --git a/src/codegen/operator/csv_scan_translator.cpp b/src/codegen/operator/csv_scan_translator.cpp index 9e8880f70c0..f8687518057 100644 --- a/src/codegen/operator/csv_scan_translator.cpp +++ b/src/codegen/operator/csv_scan_translator.cpp @@ -20,6 +20,7 @@ #include "codegen/proxy/csv_scanner_proxy.h" #include "codegen/proxy/runtime_functions_proxy.h" #include "codegen/type/sql_type.h" +#include "codegen/vector.h" #include "planner/csv_scan_plan.h" namespace peloton { @@ -28,23 +29,25 @@ namespace codegen { CSVScanTranslator::CSVScanTranslator(const planner::CSVScanPlan &scan, CompilationContext &context, Pipeline &pipeline) - : OperatorTranslator(context, pipeline), scan_(scan) { + : OperatorTranslator(scan, context, pipeline) { // Register the CSV scanner instance - auto &runtime_state = context.GetRuntimeState(); - scanner_id_ = runtime_state.RegisterState( + auto &query_state = context.GetQueryState(); + scanner_id_ = query_state.RegisterState( "csvScanner", CSVScannerProxy::GetType(GetCodeGen())); // Load information about the attributes output by the scan plan - scan_.GetAttributes(output_attributes_); + scan.GetAttributes(output_attributes_); } -void CSVScanTranslator::InitializeState() { +void CSVScanTranslator::InitializeQueryState() { auto &codegen = GetCodeGen(); + auto &scan = GetPlanAs(); + // Arguments llvm::Value *scanner_ptr = LoadStatePtr(scanner_id_); - llvm::Value *exec_ctx_ptr = GetCompilationContext().GetExecutorContextPtr(); - llvm::Value *file_path = codegen.ConstString(scan_.GetFileName(), "filePath"); + llvm::Value *exec_ctx_ptr = GetExecutorContextPtr(); + llvm::Value *file_path = codegen.ConstString(scan.GetFileName(), "filePath"); auto num_cols = static_cast(output_attributes_.size()); @@ -71,20 +74,24 @@ void CSVScanTranslator::InitializeState() { // Cast the runtime type to an opaque void*. This is because we're calling // into pre-compiled C++ that doesn't know that the dynamically generated // RuntimeState* looks like. - llvm::Value *runtime_state_ptr = codegen->CreatePointerCast( + llvm::Value *query_state_ptr = codegen->CreatePointerCast( codegen.GetState(), codegen.VoidType()->getPointerTo()); // Call CSVScanner::Init() codegen.Call(CSVScannerProxy::Init, {scanner_ptr, exec_ctx_ptr, file_path, output_col_types, - codegen.Const32(num_cols), consumer_func, runtime_state_ptr, - codegen.Const8(scan_.GetDelimiterChar()), - codegen.Const8(scan_.GetQuoteChar()), - codegen.Const8(scan_.GetEscapeChar())}); + codegen.Const32(num_cols), consumer_func, query_state_ptr, + codegen.Const8(scan.GetDelimiterChar()), + codegen.Const8(scan.GetQuoteChar()), + codegen.Const8(scan.GetEscapeChar())}); } namespace { +/** + * This is a deferred column access class configured to load the contents of a + * given column. + */ class CSVColumnAccess : public RowBatch::AttributeAccess { public: CSVColumnAccess(const planner::AttributeInfo *ai, llvm::Value *csv_columns, @@ -94,6 +101,12 @@ class CSVColumnAccess : public RowBatch::AttributeAccess { null_str_(std::move(null_str)), runtime_null_(runtime_null_str) {} + ////////////////////////////////////////////////////////////////////////////// + /// + /// Accessors + /// + ////////////////////////////////////////////////////////////////////////////// + llvm::Value *Columns() const { return csv_columns_; } uint32_t ColumnIndex() const { return ai_->attribute_id; } @@ -102,6 +115,25 @@ class CSVColumnAccess : public RowBatch::AttributeAccess { const type::SqlType &SqlType() const { return ai_->type.GetSqlType(); } + ////////////////////////////////////////////////////////////////////////////// + /// + /// Logic + /// + ////////////////////////////////////////////////////////////////////////////// + + /** + * Check if a column's value is considered NULL. Given a pointer to the + * column's string value, and the length of the string, this function will + * check if the column's value is determined to be NULL. This is done by + * comparing the column's contents with the NULL string configured in the + * CSV scan plan (i.e., provided by the user). + * + * @param codegen The codegen instance + * @param data_ptr A pointer to the column's string value + * @param data_len The length of the column's string value + * @return True if the column is equivalent to the NULL string. False + * otherwise. + */ llvm::Value *IsNull(CodeGen &codegen, llvm::Value *data_ptr, llvm::Value *data_len) const { uint32_t null_str_len = static_cast(null_str_.length()); @@ -127,6 +159,16 @@ class CSVColumnAccess : public RowBatch::AttributeAccess { return check_null.BuildPHI(cmp_res, codegen.ConstBool(false)); } + /** + * Load the value of the given column with the given type, ignoring a null + * check. + * + * @param codegen The codegen instance + * @param type The SQL type of the column + * @param data_ptr A pointer to the column's string representation + * @param data_len The length of the column's string representation + * @return The parsed value + */ Value LoadValueIgnoreNull(CodeGen &codegen, llvm::Value *type, llvm::Value *data_ptr, llvm::Value *data_len) const { @@ -144,6 +186,15 @@ class CSVColumnAccess : public RowBatch::AttributeAccess { } } + /** + * Access this column in the given row. In reality, this function pulls out + * the column information from the CSVScanner state and loads/parses the + * column's value. + * + * @param codegen The codegen instance + * @param row The row. This isn't used. + * @return The value of the column + */ Value Access(CodeGen &codegen, UNUSED_ATTRIBUTE RowBatch::Row &row) override { // Load the type, data pointer and length values for the column auto *type = codegen->CreateConstInBoundsGEP2_32( @@ -178,22 +229,31 @@ class CSVColumnAccess : public RowBatch::AttributeAccess { } private: + // Information about the attribute const planner::AttributeInfo *ai_; + + // A pointer to the array of columns llvm::Value *csv_columns_; + + // The NULL string configured for the CSV scan const std::string null_str_; + + // The runtime NULL string (a constant in LLVM) llvm::Value *runtime_null_; }; } // namespace +// We define the callback/consumer function for CSV parsing here void CSVScanTranslator::DefineAuxiliaryFunctions() { CodeGen &codegen = GetCodeGen(); CompilationContext &cc = GetCompilationContext(); + auto &scan = GetPlanAs(); + // Define consumer function here std::vector arg_types = { - {"runtimeState", - cc.GetRuntimeState().FinalizeType(codegen)->getPointerTo()}}; + {"queryState", cc.GetQueryState().GetType()->getPointerTo()}}; FunctionDeclaration decl{codegen.GetCodeContext(), "consumer", FunctionDeclaration::Visibility::Internal, codegen.VoidType(), arg_types}; @@ -209,13 +269,13 @@ void CSVScanTranslator::DefineAuxiliaryFunctions() { llvm::Value *cols = codegen->CreateLoad(codegen->CreateConstInBoundsGEP2_32( CSVScannerProxy::GetType(codegen), LoadStatePtr(scanner_id_), 0, 1)); - llvm::Value *null_str = codegen.ConstString(scan_.GetNullString(), "null"); + llvm::Value *null_str = codegen.ConstString(scan.GetNullString(), "null"); // Add accessors for all columns into the row batch std::vector column_accessors; for (uint32_t i = 0; i < output_attributes_.size(); i++) { column_accessors.emplace_back(output_attributes_[i], cols, - scan_.GetNullString(), null_str); + scan.GetNullString(), null_str); } for (uint32_t i = 0; i < output_attributes_.size(); i++) { one.AddAttribute(output_attributes_[i], &column_accessors[i]); @@ -238,17 +298,10 @@ void CSVScanTranslator::Produce() const { GetCodeGen().Call(CSVScannerProxy::Produce, {scanner_ptr}); } -void CSVScanTranslator::TearDownState() { +void CSVScanTranslator::TearDownQueryState() { auto *scanner_ptr = LoadStatePtr(scanner_id_); GetCodeGen().Call(CSVScannerProxy::Destroy, {scanner_ptr}); } -std::string CSVScanTranslator::GetName() const { - return StringUtil::Format( - "CSVScan(file: '%s', delimiter: '%c', quote: '%c', escape: '%c')", - scan_.GetFileName().c_str(), scan_.GetDelimiterChar(), - scan_.GetQuoteChar(), scan_.GetEscapeChar()); -} - } // namespace codegen -} // namespace peloton \ No newline at end of file +} // namespace peloton diff --git a/src/codegen/proxy/csv_scanner_proxy.cpp b/src/codegen/proxy/csv_scanner_proxy.cpp index f57a11fe014..c13914fbecd 100644 --- a/src/codegen/proxy/csv_scanner_proxy.cpp +++ b/src/codegen/proxy/csv_scanner_proxy.cpp @@ -18,11 +18,10 @@ namespace peloton { namespace codegen { -DEFINE_TYPE(CSVScanner, "util::CSVScanner", MEMBER(opaque1), MEMBER(cols), - MEMBER(opaque2)); +DEFINE_TYPE(CSVScanner, "util::CSVScanner", opaque1, cols, opaque2); -DEFINE_TYPE(CSVScannerColumn, "util::CSVScanner::Column", MEMBER(type), - MEMBER(ptr), MEMBER(len), MEMBER(is_null)); +DEFINE_TYPE(CSVScannerColumn, "util::CSVScanner::Column", type, ptr, len, + is_null); DEFINE_METHOD(peloton::codegen::util, CSVScanner, Init); DEFINE_METHOD(peloton::codegen::util, CSVScanner, Destroy); diff --git a/src/include/codegen/operator/csv_scan_translator.h b/src/include/codegen/operator/csv_scan_translator.h index 3389e1e5c09..9b7efca8fc6 100644 --- a/src/include/codegen/operator/csv_scan_translator.h +++ b/src/include/codegen/operator/csv_scan_translator.h @@ -36,7 +36,7 @@ class CSVScanTranslator : public OperatorTranslator { CSVScanTranslator(const planner::CSVScanPlan &scan, CompilationContext &context, Pipeline &pipeline); - void InitializeState() override; + void InitializeQueryState() override; void DefineAuxiliaryFunctions() override; @@ -48,20 +48,14 @@ class CSVScanTranslator : public OperatorTranslator { void Consume(ConsumerContext &, RowBatch::Row &) const override {} // Similar to InitializeState(), file scans don't have any state - void TearDownState() override; - - // Get a stringified version of this translator - std::string GetName() const override; + void TearDownQueryState() override; private: - // The plan - const planner::CSVScanPlan &scan_; - // The set of attributes output by the csv scan std::vector output_attributes_; // The scanner state ID - RuntimeState::StateID scanner_id_; + QueryState::Id scanner_id_; // The generated CSV scan consumer function llvm::Function *consumer_func_; From 669cca35f7ca342dc7c142178194d00c500d0a8f Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Thu, 24 May 2018 17:39:00 -0400 Subject: [PATCH 34/42] Simple function to convert tuple to string CSV --- src/codegen/buffering_consumer.cpp | 9 +++++++++ src/include/codegen/buffering_consumer.h | 2 ++ 2 files changed, 11 insertions(+) diff --git a/src/codegen/buffering_consumer.cpp b/src/codegen/buffering_consumer.cpp index 1edf1096b00..7316b8261f9 100644 --- a/src/codegen/buffering_consumer.cpp +++ b/src/codegen/buffering_consumer.cpp @@ -40,6 +40,15 @@ WrappedTuple &WrappedTuple::operator=(const WrappedTuple &o) { return *this; } +std::string WrappedTuple::ToCSV() const { + std::string ret; + for (uint32_t i = 0; i < tuple_.size(); i++) { + if (i != 0) ret.append(","); + ret.append(tuple_[i].ToString()); + } + return ret; +} + //===----------------------------------------------------------------------===// // BufferTuple() Proxy //===----------------------------------------------------------------------===// diff --git a/src/include/codegen/buffering_consumer.h b/src/include/codegen/buffering_consumer.h index 0e537486a3e..5238563c45e 100644 --- a/src/include/codegen/buffering_consumer.h +++ b/src/include/codegen/buffering_consumer.h @@ -42,6 +42,8 @@ class WrappedTuple : public ContainerTuple> { // Assignment WrappedTuple &operator=(const WrappedTuple &o); + std::string ToCSV() const; + // The tuple std::vector tuple_; }; From 3cd74b64dd8e953cc4194a86977e9e56336babf9 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Thu, 24 May 2018 17:39:31 -0400 Subject: [PATCH 35/42] Fix void* -> i8* conversion --- src/codegen/operator/csv_scan_translator.cpp | 4 ++-- src/include/codegen/proxy/type_builder.h | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/codegen/operator/csv_scan_translator.cpp b/src/codegen/operator/csv_scan_translator.cpp index f8687518057..8084fca9bb7 100644 --- a/src/codegen/operator/csv_scan_translator.cpp +++ b/src/codegen/operator/csv_scan_translator.cpp @@ -74,8 +74,8 @@ void CSVScanTranslator::InitializeQueryState() { // Cast the runtime type to an opaque void*. This is because we're calling // into pre-compiled C++ that doesn't know that the dynamically generated // RuntimeState* looks like. - llvm::Value *query_state_ptr = codegen->CreatePointerCast( - codegen.GetState(), codegen.VoidType()->getPointerTo()); + llvm::Value *query_state_ptr = + codegen->CreatePointerCast(codegen.GetState(), codegen.VoidPtrType()); // Call CSVScanner::Init() codegen.Call(CSVScannerProxy::Init, diff --git a/src/include/codegen/proxy/type_builder.h b/src/include/codegen/proxy/type_builder.h index caab2705f72..cc30f6b5f97 100644 --- a/src/include/codegen/proxy/type_builder.h +++ b/src/include/codegen/proxy/type_builder.h @@ -53,6 +53,9 @@ DEFINE_PRIMITIVE_BUILDER(unsigned long, Int64); DEFINE_PRIMITIVE_BUILDER(long long, Int64); DEFINE_PRIMITIVE_BUILDER(unsigned long long, Int64); DEFINE_PRIMITIVE_BUILDER(double, Double); +DEFINE_PRIMITIVE_BUILDER(void *, VoidPtr); +DEFINE_PRIMITIVE_BUILDER(char *, CharPtr); +DEFINE_PRIMITIVE_BUILDER(unsigned char *, CharPtr); #undef DEFINE_PRIMITIVE_BUILDER /// Const From dbb042efa80913b28da5e9ad57aed5ec2b09204e Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Thu, 24 May 2018 17:40:02 -0400 Subject: [PATCH 36/42] More tests --- test/codegen/csv_scan_test.cpp | 6 -- test/codegen/csv_scan_translator_test.cpp | 102 ++++++++++++++++++++ test/codegen/testing_codegen_util.cpp | 5 + test/include/codegen/testing_codegen_util.h | 23 +++-- 4 files changed, 124 insertions(+), 12 deletions(-) create mode 100644 test/codegen/csv_scan_translator_test.cpp diff --git a/test/codegen/csv_scan_test.cpp b/test/codegen/csv_scan_test.cpp index 89da65be90d..2cebff0873e 100644 --- a/test/codegen/csv_scan_test.cpp +++ b/test/codegen/csv_scan_test.cpp @@ -32,12 +32,6 @@ struct State { CallbackFn callback; }; -struct TempFileHandle { - std::string name; - TempFileHandle(std::string _name) : name(_name) {} - ~TempFileHandle() { boost::filesystem::remove(name); } -}; - void CSVRowCallback(void *s) { auto *state = reinterpret_cast(s); state->callback(state->scanner->GetColumns()); diff --git a/test/codegen/csv_scan_translator_test.cpp b/test/codegen/csv_scan_translator_test.cpp new file mode 100644 index 00000000000..66da8ead0d5 --- /dev/null +++ b/test/codegen/csv_scan_translator_test.cpp @@ -0,0 +1,102 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// +// csv_scan_translator_test.cpp +// +// Identification: test/codegen/csv_scan_translator_test.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "codegen/testing_codegen_util.h" + +#include "planner/csv_scan_plan.h" +#include "planner/insert_plan.h" +#include "planner/seq_scan_plan.h" +#include "util/file_util.h" + +namespace peloton { +namespace test { + +class CSVScanTranslatorTest : public PelotonCodeGenTest { + public: + CSVScanTranslatorTest() : PelotonCodeGenTest() {} + + oid_t TestTableId1() { return test_table_oids[0]; } + uint32_t NumRowsInTestTable() const { return num_rows_to_insert; } + + private: + uint32_t num_rows_to_insert = 64; +}; + +TEST_F(CSVScanTranslatorTest, IntCsvScan) { + // Test input + std::vector rows = {"1,2,3.9,four", + "5,6,7.4,eight", + "9,10,11.1,\"twelve\""}; + std::string csv_data; + for (const auto &row : rows) { + csv_data.append(row).append("\n"); + } + + /////////////////////////////////////////////////// + /// First insert contents of CSV into test table + /////////////////////////////////////////////////// + { + // Write the contents into a temporary file + TempFileHandle fh{FileUtil::WriteTempFile(csv_data, "", "tmp")}; + + // clang-format off + // NOTE: this schema has to match that of the test table! + std::vector cols = { + planner::CSVScanPlan::ColumnInfo{.name = "1", .type = peloton::type::TypeId::INTEGER}, + planner::CSVScanPlan::ColumnInfo{.name = "2", .type = peloton::type::TypeId::INTEGER}, + planner::CSVScanPlan::ColumnInfo{.name = "3", .type = peloton::type::TypeId::DECIMAL}, + planner::CSVScanPlan::ColumnInfo{.name = "4", .type = peloton::type::TypeId::VARCHAR}, + }; + // clang-format on + std::unique_ptr csv_scan{ + new planner::CSVScanPlan(fh.name, std::move(cols), ',')}; + std::unique_ptr insert{ + new planner::InsertPlan(&GetTestTable(TestTableId1()))}; + + insert->AddChild(std::move(csv_scan)); + + planner::BindingContext ctx; + insert->PerformBinding(ctx); + + codegen::BufferingConsumer consumer{{0, 1, 2, 3}, ctx}; + + // Execute insert + CompileAndExecute(*insert, consumer); + ASSERT_EQ(0, consumer.GetOutputTuples().size()); + } + + /////////////////////////////////////////////////// + /// Now scan test table, comparing results + /////////////////////////////////////////////////// + { + std::unique_ptr scan{new planner::SeqScanPlan( + &GetTestTable(TestTableId1()), nullptr, {0, 1, 2, 3})}; + + planner::BindingContext ctx; + scan->PerformBinding(ctx); + + codegen::BufferingConsumer consumer{{0, 1, 2, 3}, ctx}; + + // Execute insert + CompileAndExecute(*scan, consumer); + + const auto &output = consumer.GetOutputTuples(); + ASSERT_EQ(rows.size(), output.size()); + for (uint32_t i = 0; i < rows.size(); i++) { + EXPECT_EQ(rows[i], output[i].ToCSV()); + } + } +} + +} // namespace test +} // namespace peloton diff --git a/test/codegen/testing_codegen_util.cpp b/test/codegen/testing_codegen_util.cpp index 316b46331d6..a19598e33ed 100644 --- a/test/codegen/testing_codegen_util.cpp +++ b/test/codegen/testing_codegen_util.cpp @@ -12,6 +12,8 @@ #include "codegen/testing_codegen_util.h" +#include + #include "catalog/table_catalog.h" #include "codegen/proxy/runtime_functions_proxy.h" #include "codegen/proxy/value_proxy.h" @@ -28,6 +30,9 @@ namespace peloton { namespace test { +TempFileHandle::TempFileHandle(std::string _name) : name(_name) {} +TempFileHandle::~TempFileHandle() { boost::filesystem::remove(name); } + //===----------------------------------------------------------------------===// // PELOTON CODEGEN TEST //===----------------------------------------------------------------------===// diff --git a/test/include/codegen/testing_codegen_util.h b/test/include/codegen/testing_codegen_util.h index 5dc427f03b1..a017fdedfa5 100644 --- a/test/include/codegen/testing_codegen_util.h +++ b/test/include/codegen/testing_codegen_util.h @@ -6,7 +6,7 @@ // // Identification: test/include/codegen/testing_codegen_util.h // -// Copyright (c) 2015-17, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -40,11 +40,22 @@ using ConstExpressionPtr = using PlanPtr = std::unique_ptr; using ConstPlanPtr = std::unique_ptr; -//===----------------------------------------------------------------------===// -// Common base class for all codegen tests. This class four test tables that all -// the codegen components use. Their ID's are available through the oid_t -// enumeration. -//===----------------------------------------------------------------------===// +/** + * This is a scoped file handle that automatically deletes/removes the file + * with the given name when the class goes out of scope and the destructor is + * called. + */ +struct TempFileHandle { + std::string name; + TempFileHandle(std::string _name); + ~TempFileHandle(); +}; + +/** + * Common base class for all codegen tests. This class four test tables that all + * the codegen components use. Their ID's are available through the oid_t + * enumeration. + */ class PelotonCodeGenTest : public PelotonTest { public: std::string test_db_name = "peloton_codegen"; From 985d329d5f4a82161ffa24cc9e2af016634a3b64 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Tue, 29 May 2018 16:59:51 -0400 Subject: [PATCH 37/42] Address reviews --- src/codegen/codegen.cpp | 27 ++++++++++++--------- src/include/codegen/codegen.h | 14 +++++------ test/function/numeric_functions_test.cpp | 16 ++++++------ test/include/codegen/testing_codegen_util.h | 8 +++--- 4 files changed, 35 insertions(+), 30 deletions(-) diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index b35838d16e1..b810fd4c092 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -69,17 +69,6 @@ llvm::Value *CodeGen::ConstString(const std::string &str_val, return GetBuilder().CreateInBoundsGEP(global_var, {Const32(0), Const32(0)}); } -llvm::Value *CodeGen::ConstType(const type::Type &type) { - auto iter = type_variables_.find(type); - if (iter != type_variables_.end()) { - return iter->second; - } - const type::Type t = type; - llvm::Value *ret = ConstGenericBytes(&type, sizeof(type), "type"); - type_variables_.insert(std::make_pair(t, ret)); - return ret; -} - llvm::Value *CodeGen::ConstGenericBytes(const void *data, uint32_t length, const std::string &name) const { // Create the constant data array that wraps the input data @@ -164,6 +153,14 @@ llvm::Value *CodeGen::Printf(const std::string &format, auto *printf_fn = LookupBuiltin("printf"); if (printf_fn == nullptr) { #if GCC_AT_LEAST_6 +// In newer GCC versions (i.e., GCC 6+), function attributes are part of the +// type system and are attached to the function signature. For example, printf() +// comes with the "noexcept" attribute. Moreover, GCC 6+ will complain when +// attributes attached to a function (e.g., noexcept()) are not used at +// their call-site. Below, we use decltype(printf) to get the C/C++ function +// type of printf(...), but we discard the attributes since we don't need +// them. Hence, on GCC 6+, compilation will fail without adding the +// "-Wignored-attributes" flag. So, we add it here only. #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wignored-attributes" #endif @@ -189,6 +186,14 @@ llvm::Value *CodeGen::Memcmp(llvm::Value *ptr1, llvm::Value *ptr2, auto *memcmp_fn = LookupBuiltin(kMemcmpFnName); if (memcmp_fn == nullptr) { #if GCC_AT_LEAST_6 +// In newer GCC versions (i.e., GCC 6+), function attributes are part of the +// type system and are attached to the function signature. For example, memcmp() +// comes with the "throw()" attribute, among many others. Moreover, GCC 6+ will +// complain when attributes attached to a function are not used at their +// call-site. Below, we use decltype(memcmp) to get the C/C++ function type +// of memcmp(...), but we discard the attributes since we don't need them. +// Hence, on GCC 6+, compilation will fail without adding the +// "-Wignored-attributes" flag. So, we add it here only. #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wignored-attributes" #endif diff --git a/src/include/codegen/codegen.h b/src/include/codegen/codegen.h index 037e01dbe11..9a56edf5dfd 100644 --- a/src/include/codegen/codegen.h +++ b/src/include/codegen/codegen.h @@ -59,9 +59,10 @@ class CppProxyMember { uint32_t slot_; }; -//===----------------------------------------------------------------------===// -// The main wrapper around LLVM's IR Builder to generate IR -//===----------------------------------------------------------------------===// +/** + * The main API used to generate code in Peloton. Provides a thin wrapper around + * LLVM's IR Builder to generate IR. + */ class CodeGen { public: /// Constructor and destructor @@ -89,7 +90,8 @@ class CodeGen { } llvm::Type *ArrayType(llvm::Type *type, uint32_t num_elements) const; - /// Constant wrappers for bool, int8, int16, int32, int64, strings, and null + /// Functions to return LLVM values for constant boolean, int8, int16, int32, + // int64, strings, and null values. llvm::Constant *ConstBool(bool val) const; llvm::Constant *Const8(int8_t val) const; llvm::Constant *Const16(int16_t val) const; @@ -98,7 +100,6 @@ class CodeGen { llvm::Constant *ConstDouble(double val) const; llvm::Value *ConstString(const std::string &str_val, const std::string &name) const; - llvm::Value *ConstType(const type::Type &type); llvm::Value *ConstGenericBytes(const void *data, uint32_t length, const std::string &name) const; llvm::Constant *Null(llvm::Type *type) const; @@ -195,9 +196,6 @@ class CodeGen { private: // The context/module where all the code this class produces goes CodeContext &code_context_; - - std::unordered_map type_variables_; }; } // namespace codegen diff --git a/test/function/numeric_functions_test.cpp b/test/function/numeric_functions_test.cpp index 35622209fde..be700b4fa9f 100644 --- a/test/function/numeric_functions_test.cpp +++ b/test/function/numeric_functions_test.cpp @@ -28,9 +28,9 @@ using ::testing::Return; namespace peloton { namespace test { -class NumericFunctionsTest : public PelotonTest {}; +class NumericFunctionsTests : public PelotonTest {}; -TEST_F(NumericFunctionsTest, SqrtTest) { +TEST_F(NumericFunctionsTests, SqrtTest) { const double column_val = 9.0; const double expected = sqrt(9.0); std::vector args = { @@ -46,7 +46,7 @@ TEST_F(NumericFunctionsTest, SqrtTest) { EXPECT_TRUE(result.IsNull()); } -TEST_F(NumericFunctionsTest, FloorTest) { +TEST_F(NumericFunctionsTests, FloorTest) { // Testing Floor with DecimalTypes std::vector inputs = {9.5, 3.3, -4.4, 0.0}; std::vector args; @@ -88,7 +88,7 @@ TEST_F(NumericFunctionsTest, FloorTest) { EXPECT_TRUE(result.IsNull()); } -TEST_F(NumericFunctionsTest, RoundTest) { +TEST_F(NumericFunctionsTests, RoundTest) { std::vector column_vals = {9.5, 3.3, -4.4, -5.5, 0.0}; std::vector args; for (double val : column_vals) { @@ -104,7 +104,7 @@ TEST_F(NumericFunctionsTest, RoundTest) { EXPECT_TRUE(result.IsNull()); } -TEST_F(NumericFunctionsTest,AbsTestDouble) { +TEST_F(NumericFunctionsTests,AbsTestDouble) { std::vector doubleTestInputs = {9.5, -2.5, -4.4, 0.0}; std::vector args; for (double in : doubleTestInputs) { @@ -120,7 +120,7 @@ TEST_F(NumericFunctionsTest,AbsTestDouble) { EXPECT_TRUE(result.IsNull()); } -TEST_F(NumericFunctionsTest, AbsTestInt) { +TEST_F(NumericFunctionsTests, AbsTestInt) { std::vector bigIntTestInputs = {-20, -15, -10, 0, 10, 20}; std::vector intTestInputs = {-20, -15, -10, 0, 10, 20}; std::vector smallIntTestInputs = {-20, -15, -10, 0, 10, 20}; @@ -157,7 +157,7 @@ TEST_F(NumericFunctionsTest, AbsTestInt) { } } -TEST_F(NumericFunctionsTest, CeilTestDouble) { +TEST_F(NumericFunctionsTests, CeilTestDouble) { std::vector doubleTestInputs = {-36.0, -35.222, -0.7, -0.5, -0.2, 0.0, 0.2, 0.5, 0.7, 35.2, 36.0, 37.2222}; @@ -174,7 +174,7 @@ TEST_F(NumericFunctionsTest, CeilTestDouble) { EXPECT_TRUE(result.IsNull()); } -TEST_F(NumericFunctionsTest, CeilTestInt) { +TEST_F(NumericFunctionsTests, CeilTestInt) { std::vector bigIntTestInputs = {-20, -15, -10, 0, 10, 20}; std::vector intTestInputs = {-20, -15, -10, 0, 10, 20}; std::vector smallIntTestInputs = {-20, -15, -10, 0, 10, 20}; diff --git a/test/include/codegen/testing_codegen_util.h b/test/include/codegen/testing_codegen_util.h index a017fdedfa5..c61a47e67c2 100644 --- a/test/include/codegen/testing_codegen_util.h +++ b/test/include/codegen/testing_codegen_util.h @@ -52,9 +52,11 @@ struct TempFileHandle { }; /** - * Common base class for all codegen tests. This class four test tables that all - * the codegen components use. Their ID's are available through the oid_t - * enumeration. + * Common base class for all codegen tests. This class has four test tables + * whose IDs and names are stored in test_table_oids and test_table_names, + * respectively. The test tables all have the exact schema: column "a" and "b" + * are integers, column "c" is a decimal, and column "d" is a varchar. The table + * with the highest OID also has a primary key on column "a". */ class PelotonCodeGenTest : public PelotonTest { public: From 70f94eccf9f7262cccdc052af2a3fa91dee08a57 Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Tue, 29 May 2018 17:03:35 -0400 Subject: [PATCH 38/42] Revert "Removed serialization" This reverts commit d055ff94b02aef2ccaba86ceee7bb96ce6266d6a. --- src/network/service/peloton_service.cpp | 4 ++-- test/network/rpc_queryplan_test.cpp | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/network/service/peloton_service.cpp b/src/network/service/peloton_service.cpp index 90a5b81ee8f..9e5095a0916 100644 --- a/src/network/service/peloton_service.cpp +++ b/src/network/service/peloton_service.cpp @@ -357,7 +357,7 @@ void PelotonService::QueryPlan(::google::protobuf::RpcController *controller, LOG_ERROR("Queryplan recived desen't have type"); break; } -#if 0 + case PlanNodeType::SEQSCAN: { LOG_TRACE("SEQSCAN revieved"); std::string plan = request->plan(); @@ -400,7 +400,7 @@ void PelotonService::QueryPlan(::google::protobuf::RpcController *controller, break; } -#endif + default: { LOG_ERROR("Queryplan recived :: Unsupported TYPE: %s", PlanNodeTypeToString(plan_type).c_str()); diff --git a/test/network/rpc_queryplan_test.cpp b/test/network/rpc_queryplan_test.cpp index 90b55e06668..cb11891a1db 100644 --- a/test/network/rpc_queryplan_test.cpp +++ b/test/network/rpc_queryplan_test.cpp @@ -22,7 +22,6 @@ namespace test { class RpcQueryPlanTests : public PelotonTest {}; TEST_F(RpcQueryPlanTests, BasicTest) { -#if 0 peloton::planner::SeqScanPlan mapped_plan_ptr; const peloton::PlanNodeType type = mapped_plan_ptr.GetPlanNodeType(); @@ -33,7 +32,6 @@ TEST_F(RpcQueryPlanTests, BasicTest) { bool serialize = mapped_plan_ptr.SerializeTo(output_plan); // Becuase the plan is not completed, so it is false EXPECT_FALSE(serialize); -#endif } } } From b8d0c34c79e0ede45914e3a4db02bf3b2bce15cd Mon Sep 17 00:00:00 2001 From: Prashanth Menon Date: Tue, 29 May 2018 17:03:38 -0400 Subject: [PATCH 39/42] Revert "Removed unused serialization stuff from plan nodes" This reverts commit 74427c78151e6102a5dc61bf2278dcc0cc5f82f3. --- src/include/planner/abstract_plan.h | 27 +++ src/include/planner/abstract_scan_plan.h | 2 + src/include/planner/seq_scan_plan.h | 19 +- src/planner/abstract_plan.cpp | 3 + src/planner/seq_scan_plan.cpp | 227 ++++++++++++++++++++++- 5 files changed, 267 insertions(+), 11 deletions(-) diff --git a/src/include/planner/abstract_plan.h b/src/include/planner/abstract_plan.h index bb1428f81d4..c257b20d830 100644 --- a/src/include/planner/abstract_plan.h +++ b/src/include/planner/abstract_plan.h @@ -20,6 +20,8 @@ #include "codegen/query_parameters_map.h" #include "common/printable.h" #include "planner/binding_context.h" +#include "type/serializeio.h" +#include "type/serializer.h" #include "common/internal_types.h" #include "type/value.h" #include "util/hash_util.h" @@ -64,6 +66,8 @@ class AbstractPlan : public Printable { const AbstractPlan *GetChild(uint32_t child_index) const; + const AbstractPlan *GetParent() const; + //===--------------------------------------------------------------------===// // Accessors //===--------------------------------------------------------------------===// @@ -107,6 +111,23 @@ class AbstractPlan : public Printable { virtual std::unique_ptr Copy() const = 0; + // A plan will be sent to anther node via serialization + // So serialization should be implemented by the derived classes + + //===--------------------------------------------------------------------===// + // Serialization/Deserialization + // Each sub-class will have to implement these functions + // After the implementation for each sub-class, we should set these to pure + // virtual + //===--------------------------------------------------------------------===// + virtual bool SerializeTo(SerializeOutput &output UNUSED_ATTRIBUTE) const { + return false; + } + virtual bool DeserializeFrom(SerializeInput &input UNUSED_ATTRIBUTE) { + return false; + } + virtual int SerializeSize() const { return 0; } + virtual hash_t Hash() const; virtual bool operator==(const AbstractPlan &rhs) const; @@ -122,10 +143,16 @@ class AbstractPlan : public Printable { } } + protected: + // only used by its derived classes (when deserialization) + AbstractPlan *Parent() const { return parent_; } + private: // A plan node can have multiple children std::vector> children_; + AbstractPlan *parent_ = nullptr; + // TODO: This field is harded coded now. This needs to be changed when // optimizer has the cost model and cardinality estimation int estimated_cardinality_ = 500000; diff --git a/src/include/planner/abstract_scan_plan.h b/src/include/planner/abstract_scan_plan.h index b770d66b7fe..7241f844c74 100644 --- a/src/include/planner/abstract_scan_plan.h +++ b/src/include/planner/abstract_scan_plan.h @@ -71,6 +71,8 @@ class AbstractScan : public AbstractPlan { protected: void SetTargetTable(storage::DataTable *table) { target_table_ = table; } + void AddColumnId(oid_t col_id) { column_ids_.push_back(col_id); } + void SetPredicate(expression::AbstractExpression *predicate) { predicate_ = std::unique_ptr(predicate); } diff --git a/src/include/planner/seq_scan_plan.h b/src/include/planner/seq_scan_plan.h index fed2f12d783..9f0f411f2cb 100644 --- a/src/include/planner/seq_scan_plan.h +++ b/src/include/planner/seq_scan_plan.h @@ -18,20 +18,10 @@ #include "common/internal_types.h" #include "common/logger.h" -#include "expression/abstract_expression.h" #include "planner/abstract_scan_plan.h" #include "type/serializer.h" namespace peloton { - -namespace expression { -class Parameter; -} // namespace expression - -namespace storage { -class DataTable; -} // namespace storage - namespace planner { class SeqScanPlan : public AbstractScan { @@ -58,6 +48,15 @@ class SeqScanPlan : public AbstractScan { void SetParameterValues(std::vector *values) override; + //===--------------------------------------------------------------------===// + // Serialization/Deserialization + //===--------------------------------------------------------------------===// + bool SerializeTo(SerializeOutput &output) const override; + bool DeserializeFrom(SerializeInput &input) override; + + /* For init SerializeOutput */ + int SerializeSize() const override; + std::unique_ptr Copy() const override { auto *new_plan = new SeqScanPlan(GetTable(), GetPredicate()->Copy(), GetColumnIds()); diff --git a/src/planner/abstract_plan.cpp b/src/planner/abstract_plan.cpp index 49014a6f471..241323bb0e9 100644 --- a/src/planner/abstract_plan.cpp +++ b/src/planner/abstract_plan.cpp @@ -14,6 +14,7 @@ #include "common/logger.h" #include "common/macros.h" +#include "expression/expression_util.h" #include "util/hash_util.h" namespace peloton { @@ -37,6 +38,8 @@ const AbstractPlan *AbstractPlan::GetChild(uint32_t child_index) const { return children_[child_index].get(); } +const AbstractPlan *AbstractPlan::GetParent() const { return parent_; } + // Get a string representation of this plan std::ostream &operator<<(std::ostream &os, const AbstractPlan &plan) { os << PlanNodeTypeToString(plan.GetPlanNodeType()); diff --git a/src/planner/seq_scan_plan.cpp b/src/planner/seq_scan_plan.cpp index 7c3ba3d8a14..62e8299aae7 100644 --- a/src/planner/seq_scan_plan.cpp +++ b/src/planner/seq_scan_plan.cpp @@ -6,21 +6,246 @@ // // Identification: src/planner/seq_scan_plan.cpp // -// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// Copyright (c) 2015-17, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// #include "planner/seq_scan_plan.h" +#include "parser/select_statement.h" +#include "catalog/manager.h" +#include "catalog/schema.h" #include "common/logger.h" #include "common/macros.h" #include "expression/abstract_expression.h" +#include "expression/expression_util.h" #include "storage/data_table.h" +#include "storage/storage_manager.h" #include "common/internal_types.h" namespace peloton { namespace planner { +//===--------------------------------------------------------------------===// +// Serialization/Deserialization +//===--------------------------------------------------------------------===// + +/** + * The SeqScanPlan has the following members: + * database_id, table_id, predicate, column_id, parent(might be NULL) + * TODO: SeqScanPlan doesn't have children, so we don't need to handle it + * + * Therefore a SeqScanPlan is serialized as: + * [(int) total size] + * [(int8_t) plan type] + * [(int) database_id] + * [(int) table_id] + * [(int) num column_id] + * [(int) column id...] + * [(int8_t) expr type] : if invalid, predicate is null + * [(bytes) predicate] : predicate is Expression + * [(int8_t) plan type] : if invalid, parent is null + * [(bytes) parent] : parent is also a plan + * + * TODO: parent_ seems never be set or used + */ + +bool SeqScanPlan::SerializeTo(SerializeOutput &output) const { + // A placeholder for the total size written at the end + int start = output.Position(); + output.WriteInt(-1); + + // Write the SeqScanPlan type + PlanNodeType plan_type = GetPlanNodeType(); + output.WriteByte(static_cast(plan_type)); + + // Write database id and table id + if (!GetTable()) { + // The plan is not completed + return false; + } + oid_t database_id = GetTable()->GetDatabaseOid(); + oid_t table_id = GetTable()->GetOid(); + + output.WriteInt(static_cast(database_id)); + output.WriteInt(static_cast(table_id)); + + // If column has 0 item, just write the columnid_count with 0 + int columnid_count = GetColumnIds().size(); + output.WriteInt(columnid_count); + + // If column has 0 item, nothing happens here + for (int it = 0; it < columnid_count; it++) { + oid_t col_id = GetColumnIds()[it]; + output.WriteInt(static_cast(col_id)); + } + + // Write predicate + if (GetPredicate() == nullptr) { + // Write the type + output.WriteByte(static_cast(ExpressionType::INVALID)); + } else { + // Write the expression type + ExpressionType expr_type = GetPredicate()->GetExpressionType(); + output.WriteByte(static_cast(expr_type)); + } + + // Write parent, but parent seems never be set or used right now + if (GetParent() == nullptr) { + // Write the type + output.WriteByte(static_cast(PlanNodeType::INVALID)); + } else { + // Write the parent type + PlanNodeType parent_type = GetParent()->GetPlanNodeType(); + output.WriteByte(static_cast(parent_type)); + + // Write parent + GetParent()->SerializeTo(output); + } + + // Write the total length + int32_t sz = static_cast(output.Position() - start - sizeof(int)); + PELOTON_ASSERT(sz > 0); + output.WriteIntAt(start, sz); + + return true; +} + +/** + * Therefore a SeqScanPlan is serialized as: + * [(int) total size] + * [(int8_t) plan type] + * [(int) database_id] + * [(int) table_id] + * [(int) num column_id] + * [(int) column id...] + * [(int8_t) expr type] : if invalid, predicate is null + * [(bytes) predicate] : predicate is Expression + * [(int8_t) plan type] : if invalid, parent is null + * [(bytes) parent] : parent is also a plan + */ +bool SeqScanPlan::DeserializeFrom(SerializeInput &input) { + // Read the size of SeqScanPlan class + input.ReadInt(); + + // Read the type + UNUSED_ATTRIBUTE PlanNodeType plan_type = + (PlanNodeType)input.ReadEnumInSingleByte(); + PELOTON_ASSERT(plan_type == GetPlanNodeType()); + + // Read database id + oid_t database_oid = input.ReadInt(); + + // Read table id + oid_t table_oid = input.ReadInt(); + + // Get table and set it to the member + storage::DataTable *target_table = nullptr; + try{ + target_table = static_cast( + storage::StorageManager::GetInstance()->GetTableWithOid( + database_oid, table_oid)); + } catch (CatalogException &e) { + LOG_TRACE("Can't find table %d! Return false", table_oid); + return false; + } + SetTargetTable(target_table); + + // Read the number of column_id and set them to column_ids_ + oid_t columnid_count = input.ReadInt(); + for (oid_t it = 0; it < columnid_count; it++) { + oid_t column_id = input.ReadInt(); + AddColumnId(column_id); + } + + // Read the type + ExpressionType expr_type = (ExpressionType)input.ReadEnumInSingleByte(); + + // Predicate deserialization + if (expr_type != ExpressionType::INVALID) { + switch (expr_type) { + // case ExpressionType::COMPARE_IN: + // predicate_ = + // std::unique_ptr(new + // ComparisonExpression (101)); + // predicate_.DeserializeFrom(input); + // break; + + default: { + LOG_ERROR( + "Expression deserialization :: Unsupported EXPRESSION_TYPE: %s", + ExpressionTypeToString(expr_type).c_str()); + break; + } + } + } + + // Read the type of parent + PlanNodeType parent_type = (PlanNodeType)input.ReadEnumInSingleByte(); + + // Parent deserialization + if (parent_type != PlanNodeType::INVALID) { + switch (expr_type) { + // case ExpressionType::COMPARE_IN: + // predicate_ = + // std::unique_ptr(new + // ComparisonExpression (101)); + // predicate_.DeserializeFrom(input); + // break; + + default: { + LOG_ERROR("Parent deserialization :: Unsupported PlanNodeType: %s", + ExpressionTypeToString(expr_type).c_str()); + break; + } + } + } + + return true; +} +/** + * + * SeqScanPlan is serialized as: + * [(int) total size] + * [(int8_t) plan type] + * [(int) database_id] + * [(int) table_id] + * [(int) num column_id] + * [(int) column id...] + * [(int8_t) expr type] : if invalid, predicate is null + * [(bytes) predicate] : predicate is Expression + * [(int8_t) plan type] : if invalid, parent is null + * [(bytes) parent] : parent is also a plan + * + * So, the fixed size part is: + * [(int) total size] 4 + + * [(int8_t) plan type] 1 + + * [(int) database_id] 4 + + * [(int) table_id] 4 + + * [(int) num column_id]4 + + * [(int8_t) expr type] 1 + + * [(int8_t) plan type] 1 = + * the variant part is : + * [(int) column id...]: num column_id * 4 + * [(bytes) predicate] : predicate->GetSerializeSize() + * [(bytes) parent] : parent->GetSerializeSize() + */ +int SeqScanPlan::SerializeSize() const { + // Fixed size. see the detail above + int size_fix = sizeof(int) * 4 + 3; + int size_column_ids = GetColumnIds().size() * sizeof(int); + int size = size_fix + size_column_ids; + + if (GetPredicate() != nullptr) { + size = size + GetPredicate()->SerializeSize(); + } + if (Parent()) { + size = size + Parent()->SerializeSize(); + } + + return size; +} + void SeqScanPlan::SetParameterValues(std::vector *values) { LOG_TRACE("Setting parameter values in Sequential Scan"); From 13f84a4d71b5b884e68cc9e974e265f3d0d90e5a Mon Sep 17 00:00:00 2001 From: Prashanth Date: Wed, 6 Jun 2018 15:22:19 -0400 Subject: [PATCH 40/42] Beefed up tests, which caught more bugs --- src/codegen/util/csv_scanner.cpp | 186 ++++++++++++++++++------- src/include/codegen/util/csv_scanner.h | 12 +- src/include/util/string_util.h | 13 ++ src/util/file.cpp | 24 ++-- src/util/string_util.cpp | 9 ++ test/codegen/csv_scan_test.cpp | 177 +++++++++++++++-------- 6 files changed, 297 insertions(+), 124 deletions(-) diff --git a/src/codegen/util/csv_scanner.cpp b/src/codegen/util/csv_scanner.cpp index 0481a4444e1..5f09349f973 100644 --- a/src/codegen/util/csv_scanner.cpp +++ b/src/codegen/util/csv_scanner.cpp @@ -32,7 +32,7 @@ CSVScanner::CSVScanner(peloton::type::AbstractPool &pool, file_path_(file_path), file_(), buffer_(nullptr), - buffer_begin_(0), + buffer_pos_(0), buffer_end_(0), line_(nullptr), line_len_(0), @@ -59,12 +59,17 @@ CSVScanner::CSVScanner(peloton::type::AbstractPool &pool, CSVScanner::~CSVScanner() { if (buffer_ != nullptr) { memory_.Free(buffer_); + buffer_ = nullptr; } + if (line_ != nullptr) { memory_.Free(line_); + line_ = nullptr; } + if (cols_ != nullptr) { memory_.Free(cols_); + cols_ = nullptr; } } @@ -90,21 +95,22 @@ void CSVScanner::Produce() { Initialize(); // Loop lines - while (const char *line = NextLine()) { + while (char *line = NextLine()) { ProduceCSV(line); } } void CSVScanner::Initialize() { // Let's first perform a few validity checks - boost::filesystem::path path{file_path_}; + boost::filesystem::path path(file_path_); if (!boost::filesystem::exists(path)) { - throw ExecutorException{StringUtil::Format("input path '%s' does not exist", - file_path_.c_str())}; + throw ExecutorException(StringUtil::Format("input path '%s' does not exist", + file_path_.c_str())); } else if (!boost::filesystem::is_regular_file(file_path_)) { - throw ExecutorException{ - StringUtil::Format("unable to read file '%s'", file_path_.c_str())}; + auto msg = + StringUtil::Format("unable to read file '%s'", file_path_.c_str()); + throw ExecutorException(msg); } // The path looks okay, let's try opening it @@ -125,7 +131,7 @@ void CSVScanner::Initialize() { bool CSVScanner::NextBuffer() { // Do read - buffer_begin_ = 0; + buffer_pos_ = 0; buffer_end_ = static_cast(file_.Read(buffer_, kDefaultBufferSize)); // Update stats @@ -134,7 +140,9 @@ bool CSVScanner::NextBuffer() { return (buffer_end_ != 0); } -void CSVScanner::AppendToCurrentLine(const char *data, uint32_t len) { +void CSVScanner::AppendToLineBuffer(const char *data, uint32_t len) { + PELOTON_ASSERT(len > 0); + // Short-circuit if we're not appending any data if (len == 0) { return; @@ -146,7 +154,7 @@ void CSVScanner::AppendToCurrentLine(const char *data, uint32_t len) { const auto msg = StringUtil::Format( "Line %u in file '%s' exceeds maximum line length: %lu", line_number_ + 1, file_path_.c_str(), kMaxAllocSize); - throw Exception{msg}; + throw Exception(msg); } // The current line buffer isn't large enough to store the new bytes, so we @@ -186,41 +194,44 @@ void CSVScanner::AppendToCurrentLine(const char *data, uint32_t len) { stats_.num_copies++; } -// The main purpose of this function is to find the start of the next line in -// the CSV file. -const char *CSVScanner::NextLine() { +// The objective of this function is to find a complete line in the CSV file. +// The returned value will be a valid pointer to a null-terminated string that +// is the next line in the CSV to be processed. +char *CSVScanner::NextLine() { line_len_ = 0; + const char quote = quote_; + const char escape = (quote_ == escape_ ? static_cast('\0') : escape_); + bool in_quote = false; bool last_was_escape = false; - bool copied_to_line_buf = false; - uint32_t line_end = buffer_begin_; - - char quote = quote_; - char escape = (quote_ == escape_ ? static_cast('\0') : escape_); + const char *buf = buffer_; + uint32_t curr_buffer_pos = buffer_pos_; while (true) { - if (line_end >= buffer_end_) { + if (curr_buffer_pos == buffer_end_) { // We need to read more data from the CSV file. But first, we need to copy // all the data in the read-buffer (i.e., [buffer_begin_, buffer_end_] to // the line-buffer. + if (buffer_pos_ < curr_buffer_pos) { + AppendToLineBuffer(buffer_ + buffer_pos_, + curr_buffer_pos - buffer_pos_); + buffer_pos_ = curr_buffer_pos; + } - AppendToCurrentLine(buffer_ + buffer_begin_, - static_cast(buffer_end_ - buffer_begin_)); + // Reset positions + curr_buffer_pos = 0; // Now, read more data if (!NextBuffer()) { - return nullptr; + // We hit en EOF + break; } - - // Reset positions - line_end = buffer_begin_; - copied_to_line_buf = true; } // Read character - char c = buffer_[line_end]; + char c = buf[curr_buffer_pos++]; if (in_quote && c == escape) { last_was_escape = !last_was_escape; @@ -235,47 +246,120 @@ const char *CSVScanner::NextLine() { // Process the new-line character. If we a new-line and we're not currently // in a quoted section, we're done. if (c == '\n' && !in_quote) { - buffer_[line_end] = '\0'; break; } + } - // Move along - line_end++; + // Flush remaining valid bytes + if (buffer_pos_ < curr_buffer_pos) { + AppendToLineBuffer(buffer_ + buffer_pos_, curr_buffer_pos - buffer_pos_); + buffer_pos_ = curr_buffer_pos; } // Increment line number line_number_++; - if (copied_to_line_buf) { - AppendToCurrentLine(buffer_, line_end); - buffer_begin_ = line_end + 1; - return line_; - } else { - const char *ret = buffer_ + buffer_begin_; - buffer_begin_ = line_end + 1; - return ret; + // If we didn't transfer any bytes to the line buffer, we must have reached an + // EOF. If so, return null indicating there are no more lines. + if (line_len_ == 0) { + return nullptr; } + + // A full line has been transferred to the line buffer, but we also copied the + // newline character. Strip it off now. + line_len_--; + line_[line_len_] = '\0'; + + // Done + return line_; } -void CSVScanner::ProduceCSV(const char *line) { - // At this point, we have a well-formed line. Let's pull out pointers to the - // columns. +void CSVScanner::ProduceCSV(char *line) { + const char delimiter = delimiter_; + const char quote = quote_; + const char escape = escape_; - const auto *iter = line; - for (uint32_t col_idx = 0; col_idx < num_cols_; col_idx++) { - // Start points to the beginning of the column's data value - const char *start = iter; + // The iterator over characters in the line + char *iter = line; - // Eat text until the next delimiter - while (*iter != 0 && *iter != delimiter_) { - iter++; + for (uint32_t col_idx = 0; col_idx < num_cols_; col_idx++) { + char *col_begin = iter; + char *col_end = nullptr; + + // We need to move col_end to the end of the column's data. Along the way, + // we may need to shift data down due to quotes and escapes. Inspired by + // Postgres. + { + char *out = col_begin; + while (true) { + // This first loop looks for either the delimiter character or the end + // of the line, indicating the end of a columns data. It breaks out of + // the loop if a quote character is found. It flows into a second loop + // whose only purpose is to find the end of the quoted section. + while (true) { + char c = *iter++; + + // If we see the delimiter character, or the end of the string, + // finish + if (c == delimiter || c == '\0') { + col_end = out; + iter--; + goto colend; + } + + // If we see a quote character, move to the second loop to find the + // closing quote. + if (c == quote) { + break; + } + + *out++ = c; + } + + while (true) { + char c = *iter++; + + // If we see the end of the line *within* a quoted section, throw + // error + if (c == '\0') { + throw Exception(StringUtil::Format( + "unterminated CSV quoted field at %u", col_idx)); + } + + // If we see an escape character within a quoted section, we need to + // check if the following character is a quote. If so, we must + // escape it + if (c == escape) { + char next = *iter; + if (next == quote || next == escape) { + *out++ = next; + iter++; + continue; + } + } + + // If we see the closing quote, we're done. + if (c == quote) { + break; + } + + *out++ = c; + } + } } - // At this point, iter points to the end of the column's data value + colend: + // If we've reached the of the line, but haven't setup all the columns, then + // we're missing data for the remaining columns and should throw an error. + if (*iter == '\0' && col_idx != (num_cols_ - 1)) { + throw Exception( + StringUtil::Format("missing data for column %u on line %u", + (col_idx + 2), line_number_)); + } // Let's setup the columns - cols_[col_idx].ptr = start; - cols_[col_idx].len = static_cast(iter - start); + cols_[col_idx].ptr = col_begin; + cols_[col_idx].len = static_cast(col_end - col_begin); cols_[col_idx].is_null = (cols_[col_idx].len == 0); // Eat delimiter, moving to next column diff --git a/src/include/codegen/util/csv_scanner.h b/src/include/codegen/util/csv_scanner.h index a946dec903e..f230354c5fa 100644 --- a/src/include/codegen/util/csv_scanner.h +++ b/src/include/codegen/util/csv_scanner.h @@ -158,17 +158,17 @@ class CSVScanner { // Initialize the scan void Initialize(); - // Append bytes to the end of the currently accruing line. - void AppendToCurrentLine(const char *data, uint32_t len); + // Append bytes to the end of the line buffer + void AppendToLineBuffer(const char *data, uint32_t len); // Read the next line from the CSV file - const char *NextLine(); + char *NextLine(); // Read a buffer's worth of data from the CSV file bool NextBuffer(); // Produce CSV data stored in the provided line - void ProduceCSV(const char *line); + void ProduceCSV(char *line); private: // All memory allocations happen from this pool @@ -180,10 +180,10 @@ class CSVScanner { // The CSV file handle peloton::util::File file_; - // The temporary buffer where raw file contents are read into + // The temporary read-buffer where raw file contents are first read into // TODO: make these unique_ptr's with a customer deleter char *buffer_; - uint32_t buffer_begin_; + uint32_t buffer_pos_; uint32_t buffer_end_; // A pointer to the start of a line in the CSV file diff --git a/src/include/util/string_util.h b/src/include/util/string_util.h index d61f297ce09..9882ce3ecd5 100644 --- a/src/include/util/string_util.h +++ b/src/include/util/string_util.h @@ -133,6 +133,19 @@ class StringUtil { static void RTrim(std::string &str); static std::string Indent(const int num_indent); + + /** + * Return a new string that has stripped all occurrences of the provided + * character from the provided string. + * + * NOTE: This function copies the input string into a new string, which is + * wasteful. Don't use this for performance critical code, please! + * + * @param str The input string + * @param c The character we want to remove + * @return A new string with no occurrences of the provided character + */ + static std::string Strip(const std::string &str, char c); }; } // namespace peloton diff --git a/src/util/file.cpp b/src/util/file.cpp index de0835982c8..275d3848418 100644 --- a/src/util/file.cpp +++ b/src/util/file.cpp @@ -42,8 +42,8 @@ void File::Open(const std::string &name, File::AccessMode access_mode) { // Check error if (fd == -1) { - throw Exception{ - StringUtil::Format("Unable to read file '%s'", name.c_str())}; + throw Exception( + StringUtil::Format("unable to read file '%s'", name.c_str())); } // Done @@ -59,8 +59,8 @@ uint64_t File::Read(void *data, uint64_t len) const { // Check error if (bytes_read == -1) { - throw Exception{ - StringUtil::Format("Error reading file: %s", strerror(errno))}; + throw Exception( + StringUtil::Format("error reading file: %s", strerror(errno))); } // Done @@ -76,8 +76,8 @@ uint64_t File::Write(void *data, uint64_t len) const { // Check error if (bytes_written == -1) { - throw Exception{ - StringUtil::Format("Error writing to file: %s", strerror(errno))}; + throw Exception( + StringUtil::Format("error writing to file: %s", strerror(errno))); } // Done @@ -91,23 +91,23 @@ uint64_t File::Size() const { // Save the current position off_t curr_off = lseek(fd_, 0, SEEK_CUR); if (curr_off == -1) { - throw Exception{StringUtil::Format( - "unable to read current position in file: %s", strerror(errno))}; + throw Exception(StringUtil::Format( + "unable to read current position in file: %s", strerror(errno))); } // Seek to the end of the file, returning the new file position i.e., the // size of the file in bytes. off_t off = lseek(fd_, 0, SEEK_END); if (off == -1) { - throw Exception{StringUtil::Format( - "unable to move file position to end file: %s", strerror(errno))}; + throw Exception(StringUtil::Format( + "unable to move file position to end file: %s", strerror(errno))); } off_t restore = lseek(fd_, curr_off, SEEK_SET); if (restore == -1) { - throw Exception{StringUtil::Format( + throw Exception(StringUtil::Format( "unable to restore position after moving to the end: %s", - strerror(errno))}; + strerror(errno))); } // Restore position diff --git a/src/util/string_util.cpp b/src/util/string_util.cpp index d4fca199219..a0f8ba3987f 100644 --- a/src/util/string_util.cpp +++ b/src/util/string_util.cpp @@ -190,4 +190,13 @@ std::vector StringUtil::Split(const std::string &input, } return splits; } + +std::string StringUtil::Strip(const std::string &str, char c) { + // There's a copy here which is wasteful, so don't use this in performance + // critical code! + std::string tmp = str; + tmp.erase(std::remove(tmp.begin(), tmp.end(), c), tmp.end()); + return tmp; } + +} // namespace peloton diff --git a/test/codegen/csv_scan_test.cpp b/test/codegen/csv_scan_test.cpp index 2cebff0873e..127e73b968f 100644 --- a/test/codegen/csv_scan_test.cpp +++ b/test/codegen/csv_scan_test.cpp @@ -14,10 +14,8 @@ #include "codegen/util/csv_scanner.h" #include "common/timer.h" -#include "function/date_functions.h" -#include "function/numeric_functions.h" -#include "function/string_functions.h" #include "util/file_util.h" +#include "util/string_util.h" namespace peloton { namespace test { @@ -39,14 +37,15 @@ void CSVRowCallback(void *s) { void IterateAsCSV(const std::vector &rows, const std::vector &col_types, - CallbackFn callback, char delimiter = ',') { + CallbackFn callback, char delimiter = ',', char quote = '"', + char escape = '"') { std::string csv_data; - for (uint32_t i = 0; i < rows.size(); i++) { - csv_data.append(rows[i]).append("\n"); + for (const auto &row : rows) { + csv_data.append(row).append("\n"); } // Write the contents into a temporary file - TempFileHandle fh{FileUtil::WriteTempFile(csv_data, "", "tmp")}; + TempFileHandle fh(FileUtil::WriteTempFile(csv_data, "", "tmp")); // The memory pool auto &pool = *TestingHarness::GetInstance().GetTestingPool(); @@ -55,9 +54,10 @@ void IterateAsCSV(const std::vector &rows, State state = {.scanner = nullptr, .callback = callback}; // The scanner - codegen::util::CSVScanner scanner{ + codegen::util::CSVScanner scanner( pool, fh.name, col_types.data(), static_cast(col_types.size()), - CSVRowCallback, reinterpret_cast(&state), delimiter}; + CSVRowCallback, reinterpret_cast(&state), delimiter, quote, + escape); state.scanner = &scanner; @@ -65,8 +65,8 @@ void IterateAsCSV(const std::vector &rows, scanner.Produce(); } -TEST_F(CSVScanTest, SimpleNumericScan) { - // Create a temporary CSV file +TEST_F(CSVScanTest, NumericScanTest) { + // The set of test rows and their types std::vector rows = {"1,2,3.0,4", "4,5,6.0,7", "8,9,10.0,11"}; std::vector types = {{type::TypeId::INTEGER, false}, {type::TypeId::INTEGER, false}, @@ -74,73 +74,140 @@ TEST_F(CSVScanTest, SimpleNumericScan) { {type::TypeId::INTEGER, false}}; uint32_t rows_read = 0; - IterateAsCSV(rows, types, [&rows_read, &types]( + IterateAsCSV(rows, types, [&rows, &rows_read, &types]( const codegen::util::CSVScanner::Column *cols) { - rows_read++; + // Split the input row into column values + const auto input_parts = StringUtil::Split(rows[rows_read++], ','); + + // Check contents of row based on test input + for (uint32_t i = 0; i < types.size(); i++) { + // The column isn't null + EXPECT_FALSE(cols[i].is_null); + + // The column has a value + EXPECT_GT(cols[i].len, 0); + + // Check the string representations + EXPECT_EQ(input_parts[i], std::string(cols[i].ptr, cols[i].len)); + } + }); + + EXPECT_EQ(rows.size(), rows_read); +} + +TEST_F(CSVScanTest, QuoteEscapeTest) { + // The set of test rows and their types + std::vector rows = {"yea he's \"cool\",1,2", "a quote:\"\",3,4"}; + std::vector types = {{type::TypeId::VARCHAR, false}, + {type::TypeId::INTEGER, false}, + {type::TypeId::INTEGER, false}}; + + uint32_t rows_read = 0; + IterateAsCSV(rows, types, [&rows, &rows_read, &types]( + const codegen::util::CSVScanner::Column *cols) { + // Split the input row into column values + auto input_parts = StringUtil::Split(rows[rows_read++], ','); + + // Check contents of row based on test input for (uint32_t i = 0; i < types.size(); i++) { + // The column isn't null EXPECT_FALSE(cols[i].is_null); + + // The column has a value EXPECT_GT(cols[i].len, 0); + + // Check the string representations. We need to strip off any quotes from + // the original string since the CSV scan will strip them for us. + EXPECT_EQ(StringUtil::Strip(input_parts[i], '"'), + std::string(cols[i].ptr, cols[i].len)); } }); - // Check EXPECT_EQ(rows.size(), rows_read); } -TEST_F(CSVScanTest, MixedStringScan) { - // Create a temporary CSV file +TEST_F(CSVScanTest, MixedStringTest) { std::vector rows = { - "1,1994-01-01,3,test", "4,2018-01-01,6,\"test\"", + "1,1994-01-01,3,test", "4,2018-01-01,6,\"quoted_test\"", "8,2016-05-05,10,\"test\nnewline\ninquote\""}; std::vector types = {{type::TypeId::INTEGER, false}, {type::TypeId::DATE, false}, {type::TypeId::INTEGER, false}, {type::TypeId::VARCHAR, false}}; - - std::vector rows_read; - IterateAsCSV(rows, types, [&rows_read, &types]( + uint32_t rows_read = 0; + IterateAsCSV(rows, types, [&rows, &rows_read, &types]( const codegen::util::CSVScanner::Column *cols) { - std::string row; + // Split the input row into column values + auto input_parts = StringUtil::Split(rows[rows_read++], ','); + for (uint32_t i = 0; i < types.size(); i++) { + // The column isn't null EXPECT_FALSE(cols[i].is_null); + + // The column has a value EXPECT_GT(cols[i].len, 0); - if (i > 0) row.append(","); - switch (types[i].type_id) { - case type::TypeId::INTEGER: { - row.append(std::to_string(function::NumericFunctions::InputInteger( - types[i], cols[i].ptr, cols[i].len))); - break; - } - case type::TypeId::DATE: { - auto raw_date = function::DateFunctions::InputDate( - types[i], cols[i].ptr, cols[i].len); - int32_t year, month, day; - function::DateFunctions::JulianToDate(raw_date, year, month, day); - row.append(StringUtil::Format("%u-%02u-%02u", year, month, day)); - break; - } - case type::TypeId::VARCHAR: { - auto ret = function::StringFunctions::InputString( - types[i], cols[i].ptr, cols[i].len); - row.append(std::string{ret.str, ret.length - 1}); - break; - } - default: { - throw Exception{StringUtil::Format( - "Did not expect column type '%s' in test. Did you forget to " - "modify the switch statement to handle a column type you've added" - "in the test case?", - TypeIdToString(types[i].type_id).c_str())}; - } - } + + // Check the string representations. We need to strip off any quotes from + // the original string since the CSV scan will strip them for us. + EXPECT_EQ(StringUtil::Strip(input_parts[i], '"'), + std::string(cols[i].ptr, cols[i].len)); } - rows_read.push_back(row); }); - // Check - ASSERT_EQ(rows.size(), rows_read.size()); - for (uint32_t i = 0; i < rows.size(); i++) { - EXPECT_EQ(rows[i], rows_read[i]); + EXPECT_EQ(rows.size(), rows_read); +} + +TEST_F(CSVScanTest, CatchErrorsTest) { + //////////////////////////////////////////////////////////////////// + /// + /// Test Case - Missing last column + /// + //////////////////////////////////////////////////////////////////// + { + std::vector missing_col = {"1,1994-01-01,3"}; + std::vector types = {{type::TypeId::INTEGER, false}, + {type::TypeId::DATE, false}, + {type::TypeId::INTEGER, false}, + {type::TypeId::VARCHAR, false}}; + EXPECT_ANY_THROW(IterateAsCSV( + missing_col, types, + [](UNUSED_ATTRIBUTE const codegen::util::CSVScanner::Column *cols) { + FAIL(); + })); + } + + //////////////////////////////////////////////////////////////////// + /// + /// Test Case - Unclosed quote + /// + //////////////////////////////////////////////////////////////////// + { + std::vector missing_col = {"1,\"unclosed,3"}; + std::vector types = {{type::TypeId::INTEGER, false}, + {type::TypeId::VARCHAR, false}, + {type::TypeId::INTEGER, false}}; + EXPECT_ANY_THROW(IterateAsCSV( + missing_col, types, + [](UNUSED_ATTRIBUTE const codegen::util::CSVScanner::Column *cols) { + FAIL(); + })); + } + + //////////////////////////////////////////////////////////////////// + /// + /// Test Case - Unclosed quote + /// + //////////////////////////////////////////////////////////////////// + { + std::vector missing_col = {"1,unclosed\",3"}; + std::vector types = {{type::TypeId::INTEGER, false}, + {type::TypeId::VARCHAR, false}, + {type::TypeId::INTEGER, false}}; + EXPECT_ANY_THROW(IterateAsCSV( + missing_col, types, + [](UNUSED_ATTRIBUTE const codegen::util::CSVScanner::Column *cols) { + FAIL(); + })); } } From bca783db58dd56cd8544a171f52031d85540b29b Mon Sep 17 00:00:00 2001 From: Prashanth Date: Wed, 6 Jun 2018 16:14:38 -0400 Subject: [PATCH 41/42] Fix tests --- test/codegen/csv_scan_translator_test.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/test/codegen/csv_scan_translator_test.cpp b/test/codegen/csv_scan_translator_test.cpp index 66da8ead0d5..320db518117 100644 --- a/test/codegen/csv_scan_translator_test.cpp +++ b/test/codegen/csv_scan_translator_test.cpp @@ -16,6 +16,7 @@ #include "planner/csv_scan_plan.h" #include "planner/insert_plan.h" #include "planner/seq_scan_plan.h" +#include "util/string_util.h" #include "util/file_util.h" namespace peloton { @@ -33,10 +34,21 @@ class CSVScanTranslatorTest : public PelotonCodeGenTest { }; TEST_F(CSVScanTranslatorTest, IntCsvScan) { - // Test input - std::vector rows = {"1,2,3.9,four", - "5,6,7.4,eight", - "9,10,11.1,\"twelve\""}; + // The quoting character and a helper function to quote a given string + const char quote = '"'; + const auto quote_string = [quote](std::string s) { + return StringUtil::Format("%c%s%c", quote, s.c_str(), quote); + }; + + // Test input rows + // clang-format off + std::vector rows = { + "1,2,3.9,four", + "5,6,7.4,eight", + "9,10,11.1," + quote_string("twelve"), + "14,15,16.7,eighteen " + quote_string("nineteen") + " twenty " + quote_string("twenty-one")}; + // clang-format on + std::string csv_data; for (const auto &row : rows) { csv_data.append(row).append("\n"); @@ -93,7 +105,7 @@ TEST_F(CSVScanTranslatorTest, IntCsvScan) { const auto &output = consumer.GetOutputTuples(); ASSERT_EQ(rows.size(), output.size()); for (uint32_t i = 0; i < rows.size(); i++) { - EXPECT_EQ(rows[i], output[i].ToCSV()); + EXPECT_EQ(StringUtil::Strip(rows[i], '"'), output[i].ToCSV()); } } } From e327ac7e7ef737d5ee2f5e3d01dc7ec52f70eed6 Mon Sep 17 00:00:00 2001 From: Prashanth Date: Wed, 6 Jun 2018 16:16:19 -0400 Subject: [PATCH 42/42] Reducing copying overhead for columns, constraints and loop variables during CheckConstraints(). We were spending 50% of our time here during bulk insertions into wide tables due to unnecessary copying! --- src/include/catalog/schema.h | 2 +- src/storage/data_table.cpp | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/include/catalog/schema.h b/src/include/catalog/schema.h index 43a62d6444f..2f6875b453b 100644 --- a/src/include/catalog/schema.h +++ b/src/include/catalog/schema.h @@ -126,7 +126,7 @@ class Schema : public Printable { return columns[column_id].IsInlined(); } - inline const Column GetColumn(const oid_t column_id) const { + inline const Column &GetColumn(const oid_t column_id) const { return columns[column_id]; } diff --git a/src/storage/data_table.cpp b/src/storage/data_table.cpp index 1fd81b76865..3660fcc2f79 100644 --- a/src/storage/data_table.cpp +++ b/src/storage/data_table.cpp @@ -155,12 +155,12 @@ bool DataTable::CheckConstraints(const AbstractTuple *tuple) const { // column. Like maybe can store a list of just columns that // even have constraints defined so that we don't have to // look at each column individually. - oid_t column_count = schema->GetColumnCount(); + size_t column_count = schema->GetColumnCount(); for (oid_t column_itr = 0; column_itr < column_count; column_itr++) { - std::vector column_cons = + const std::vector &column_constraints = schema->GetColumn(column_itr).GetConstraints(); - for (auto cons : column_cons) { - ConstraintType type = cons.GetType(); + for (const auto &constraint : column_constraints) { + ConstraintType type = constraint.GetType(); switch (type) { case ConstraintType::NOTNULL: { if (CheckNotNulls(tuple, column_itr) == false) { @@ -208,9 +208,9 @@ bool DataTable::CheckConstraints(const AbstractTuple *tuple) const { LOG_TRACE("%s", error.c_str()); throw ConstraintException(error); } - } // SWITCH - } // FOR (constraints) - } // FOR (columns) + } + } + } return true; }