源码简要分析
分析于2024.8.2
国内网络环境下 git clone --recursive 拉不下来子模块,只能手动下载后自己组装(使用Docker更省事😋,教程在第二章)
vendored: 提供PG解析器,md5,xxhash相关组件
tools:杂项,包括了Docker构建的过程,Python Package构建,MLIR基础设施,便于调试的Source Map,mlir-tools
建议先从mlir-tools看起,SQL->MLIR有意思,这里先贴一段,有空闲时问下大模型
解释下面C++代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
| #include "frontend/SQL/Parser.h" #include "mlir/Dialect/SubOperator/SubOperatorDialect.h" #include "mlir/Dialect/SubOperator/SubOperatorOps.h" #include "runtime/Session.h" void printMLIR(std::string sql, std::shared_ptr<runtime::Catalog> catalog) { mlir::MLIRContext context; mlir::DialectRegistry registry; registry.insert<mlir::BuiltinDialect>(); registry.insert<mlir::relalg::RelAlgDialect>(); registry.insert<mlir::subop::SubOperatorDialect>(); registry.insert<mlir::tuples::TupleStreamDialect>(); registry.insert<mlir::db::DBDialect>(); registry.insert<mlir::func::FuncDialect>(); registry.insert<mlir::arith::ArithDialect>();
registry.insert<mlir::memref::MemRefDialect>(); registry.insert<mlir::util::UtilDialect>(); registry.insert<mlir::scf::SCFDialect>(); registry.insert<mlir::LLVM::LLVMDialect>(); context.appendDialectRegistry(registry); context.loadAllAvailableDialects(); context.loadDialect<mlir::relalg::RelAlgDialect>(); mlir::OpBuilder builder(&context); mlir::ModuleOp moduleOp = builder.create<mlir::ModuleOp>(builder.getUnknownLoc()); frontend::sql::Parser translator(sql, *catalog, moduleOp);
builder.setInsertionPointToStart(moduleOp.getBody()); auto* queryBlock = new mlir::Block; { mlir::OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToStart(queryBlock); auto val = translator.translate(builder); if (val.has_value()) { builder.create<mlir::subop::SetResultOp>(builder.getUnknownLoc(), 0, val.value()); } builder.create<mlir::func::ReturnOp>(builder.getUnknownLoc()); } mlir::func::FuncOp funcOp = builder.create<mlir::func::FuncOp>(builder.getUnknownLoc(), "main", builder.getFunctionType({}, {})); funcOp.getBody().push_back(queryBlock);
mlir::OpPrintingFlags flags; flags.assumeVerified(); moduleOp->print(llvm::outs(), flags); } int main(int argc, char** argv) { std::string filename = std::string(argv[1]); auto catalog = runtime::Catalog::createEmpty(); if (argc >= 3) { std::string dbDir = std::string(argv[2]); catalog = runtime::DBCatalog::create(catalog, dbDir, false); } std::ifstream istream{filename}; std::stringstream buffer; buffer << istream.rdbuf(); while (true) { std::stringstream query; std::string line; std::getline(buffer, line); while (true) { if (!buffer.good()) { if (buffer.eof()) { query << line << std::endl; } break; } query << line << std::endl; if (!line.empty() && line.find(';') == line.size() - 1) { break; } std::getline(buffer, line); } printMLIR(query.str(),catalog); if (buffer.eof()) { break; } } return 0; }
|
test: 针对各Dialect实现的lit(LLVM Integrated Tester)测试,包含mlir测试用例与sqlite测试的输入数据
resources:SQL数据与Apache Arrow的相关数据
llvm-project: 关于LLVM的Submodule
TUM居然自己修改了一版LLVM!!!确实是研究型数据库🥵(看commit时间应该介于LLVM17-LLVM18之间)
lib: 项目实现的静态文件
Conversion实现论文的层级流转
include:项目的库文件
utility下面是Tracer的相关实现文件
frontend下面有SQL解析的头文件
execution涉及SQL运行细节(使用Arrow Compute实现,并用Intel的OneAPI进行加速?)
runtime有很多细节实现
runtime/Catalog.h涉及文件读取
runtime/Relation.h涉及文件关系,Arrow读取则依赖PyArrow的库
让我惊讶的是:.venv/lib/python3.10/site-packages 下面的库也被调用了!(即PyArrow包装的C++库)
eval有相关操作的实现
parser.cpp
非常经典的Parser,使用Enum量构建AST树
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
| std::optional<mlir::Value> frontend::sql::Parser::translate(mlir::OpBuilder& builder) { if (result.tree && result.tree->length == 1) { auto* statement = static_cast<Node*>(result.tree->head->data.ptr_value); switch (statement->type) { case T_VariableSetStmt: { auto* variableSetStatement = reinterpret_cast<VariableSetStmt*>(statement); translateVariableSetStatement(builder, variableSetStatement); break; } case T_CreateStmt: { translateCreateStatement(builder, reinterpret_cast<CreateStmt*>(statement)); break; } case T_CopyStmt: { auto* copyStatement = reinterpret_cast<CopyStmt*>(statement); translateCopyStatement(builder, copyStatement); break; } case T_SelectStmt: { parallelismAllowed = true; TranslationContext context; auto scope = context.createResolverScope(); auto [tree, targetInfo] = translateSelectStmt(builder, reinterpret_cast<SelectStmt*>(statement), context, scope); std::vector<mlir::Attribute> attrs; std::vector<mlir::Attribute> names; std::vector<mlir::Attribute> colMemberNames; std::vector<mlir::Attribute> colTypes; auto& memberManager = builder.getContext()->getLoadedDialect<mlir::subop::SubOperatorDialect>()->getMemberManager(); for (auto x : targetInfo.namedResults) { if (x.first == "primaryKeyHashValue") continue; names.push_back(builder.getStringAttr(x.first)); auto colMemberName = memberManager.getUniqueMember(x.first.empty() ? 
"unnamed" : x.first); auto columnType = x.second->type; attrs.push_back(attrManager.createRef(x.second)); colTypes.push_back(mlir::TypeAttr::get(columnType)); colMemberNames.push_back(builder.getStringAttr(colMemberName)); } auto resultTableType = mlir::subop::ResultTableType::get(builder.getContext(), mlir::subop::StateMembersAttr::get(builder.getContext(), builder.getArrayAttr(colMemberNames), builder.getArrayAttr(colTypes))); return builder.create<mlir::relalg::MaterializeOp>(builder.getUnknownLoc(), resultTableType, tree, builder.getArrayAttr(attrs), builder.getArrayAttr(names)); } case T_InsertStmt: { translateInsertStmt(builder, reinterpret_cast<InsertStmt*>(statement)); break; } default: throw std::runtime_error("unsupported statement type"); } } return {}; }
|
Docker镜像运行
编译环境是不可能折腾的!
有Docker的话当然用Docker🤣
1 2
| docker pull ghcr.io/lingo-db/lingo-db:latest docker run -it --name lingo ghcr.io/lingo-db/lingo-db /bin/bash
|
编译好的文件默认在build/lingodb-release
貌似跑了TPC-DS?
有一个lingodb-llvm包装在Python环境里面,所以llvm-project目录下面是空的
修改下官网给的测试(Data部分需要从外部拷入)
修改后的shell:
1 2 3 4 5 6 7
| echo "select * from studenten where name='Carnap'" > test.sql ./sql-to-mlir test.sql /repo/resources/data/uni/ > canonical.mlir ./mlir-db-opt --use-db /repo/resources/data/uni/ --relalg-query-opt canonical.mlir > optimized.mlir ./mlir-db-opt --lower-relalg-to-subop optimized.mlir > subop.mlir ./mlir-db-opt --lower-subop subop.mlir > hl-imperative.mlir ./mlir-db-opt --lower-db hl-imperative.mlir > ml-imperative.mlir ./mlir-db-opt --lower-dsa ml-imperative.mlir > ll-imperative.mlir
|
制作Debug版本
内存建议大于8GB,编译中出现的任何错误一律按内存不够大处理(问就是试过了)😅
你也可以使用我打包好的Docker Image:
docker pull ccr.ccs.tencentyun.com/mocusz/lingo-debug
编译完后,安装GDB
配置VS Code的launch.json文件(供参考,有需要的自己添加,主要是args和cwd)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
| { "version": "0.2.0", "configurations": [ { "name": "(gdb) Launch sql-to-mlir", "type": "cppdbg", "request": "launch", "program": "${workspaceFolder}/build/lingodb-debug/sql-to-mlir", "args": ["./build/lingodb-debug/test.sql","./resources/data/uni/"], "stopAtEntry": false, "cwd": "${workspaceFolder}", "environment": [], "externalConsole": false, "MIMode": "gdb", "setupCommands": [ { "description": "Enable pretty-printing for gdb", "text": "-enable-pretty-printing", "ignoreFailures": true }, { "description": "Set Disassembly Flavor to Intel", "text": "-gdb-set disassembly-flavor intel", "ignoreFailures": true } ] }, ] }
|
效果如图: