源码简要分析

分析于2024.8.2

国内git recursive拖不下来,只能手动下载下来自己组装 (使用Docker更省事😋教程在第二章)

vendored: 提供PG解析器,md5,xxhash相关组件

tools:杂项,包括了Docker构建的过程,Python Package构建,MLIR基础设施,便于调试的Source Map,mlir-tools

建议先从mlir-tools看起,SQL->MLIR有意思,这里先贴一段,有空闲时问下大模型

解释下面C++代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>

#include "frontend/SQL/Parser.h"
#include "mlir/Dialect/SubOperator/SubOperatorDialect.h"
#include "mlir/Dialect/SubOperator/SubOperatorOps.h"
#include "runtime/Session.h"
void printMLIR(std::string sql, std::shared_ptr<runtime::Catalog> catalog) {
mlir::MLIRContext context;
mlir::DialectRegistry registry;
registry.insert<mlir::BuiltinDialect>();
registry.insert<mlir::relalg::RelAlgDialect>();
registry.insert<mlir::subop::SubOperatorDialect>();
registry.insert<mlir::tuples::TupleStreamDialect>();
registry.insert<mlir::db::DBDialect>();
registry.insert<mlir::func::FuncDialect>();
registry.insert<mlir::arith::ArithDialect>();

registry.insert<mlir::memref::MemRefDialect>();
registry.insert<mlir::util::UtilDialect>();
registry.insert<mlir::scf::SCFDialect>();
registry.insert<mlir::LLVM::LLVMDialect>();
context.appendDialectRegistry(registry);
context.loadAllAvailableDialects();
context.loadDialect<mlir::relalg::RelAlgDialect>();
mlir::OpBuilder builder(&context);
mlir::ModuleOp moduleOp = builder.create<mlir::ModuleOp>(builder.getUnknownLoc());
frontend::sql::Parser translator(sql, *catalog, moduleOp);

builder.setInsertionPointToStart(moduleOp.getBody());
auto* queryBlock = new mlir::Block;
{
mlir::OpBuilder::InsertionGuard guard(builder);
builder.setInsertionPointToStart(queryBlock);
auto val = translator.translate(builder);
if (val.has_value()) {
builder.create<mlir::subop::SetResultOp>(builder.getUnknownLoc(), 0, val.value());
}
builder.create<mlir::func::ReturnOp>(builder.getUnknownLoc());
}
mlir::func::FuncOp funcOp = builder.create<mlir::func::FuncOp>(builder.getUnknownLoc(), "main", builder.getFunctionType({}, {}));
funcOp.getBody().push_back(queryBlock);

mlir::OpPrintingFlags flags;
flags.assumeVerified();
moduleOp->print(llvm::outs(), flags);
}
/// Entry point: `sql-to-mlir <sql-file> [<db-dir>]`.
///
/// Reads the whole SQL file, splits it into statements and calls printMLIR for
/// each of them. A statement ends at the first line whose first ';' is also its
/// last character, or at the end of the input. When a database directory is
/// given, the catalog is created from it; otherwise an empty catalog is used.
///
/// @return 0 on success, 1 if the arguments are missing or the file cannot be opened.
int main(int argc, char** argv) {
   if (argc < 2) {
      std::cerr << "usage: " << (argc > 0 ? argv[0] : "sql-to-mlir") << " <sql-file> [<db-dir>]\n";
      return 1;
   }
   const std::string filename{argv[1]};
   auto catalog = runtime::Catalog::createEmpty();
   if (argc >= 3) {
      std::string dbDir{argv[2]};
      catalog = runtime::DBCatalog::create(catalog, dbDir, false);
   }
   std::ifstream istream{filename};
   if (!istream) {
      std::cerr << "could not open '" << filename << "'\n";
      return 1;
   }
   std::stringstream buffer;
   buffer << istream.rdbuf();
   while (true) {
      std::stringstream query;
      std::string line;
      std::getline(buffer, line);
      while (true) {
         if (!buffer.good()) {
            // At end of input the last (possibly unterminated) line still belongs to the query.
            if (buffer.eof()) {
               query << line << '\n';
            }
            break;
         }
         query << line << '\n';
         // A line whose first ';' is its final character terminates the statement.
         if (!line.empty() && line.find(';') == line.size() - 1) {
            break;
         }
         std::getline(buffer, line);
      }
      printMLIR(query.str(), catalog);
      // Leave the loop on any stream failure, not only on eof: an empty input sets
      // failbit without eofbit, which previously made this loop run forever.
      if (!buffer.good()) {
         break;
      }
   }
   return 0;
}

test: 各Dialect实现的lit(LLVM Integrated Tester)测试用例,输入为mlir文件与sqlite测试数据

resources:SQL数据与Apache Arrow的相关数据

llvm-project: 关于LLVM的Submodule

TUM居然自己修改了一版LLVM!!!确实是研究型数据库🥵(看commit时间应该介于LLVM17-LLVM18之间)

image-20240802181800740

lib: 项目的实现文件(源码)

Conversion实现论文的层级流转

include:项目的头文件

utility下面是Tracer的相关实现文件

frontend下面有SQL解析的头文件

execution涉及SQL运行细节(使用Arrow Compute实现,并用Intel的OneAPI进行加速?)

runtime有很多细节实现

runtime/Catalog.h涉及文件读取

runtime/Relation.h涉及文件关系,Arrow读取则依赖PyArrow的库

让我惊讶的是:.venv/lib/python3.10/site-packages下面的库也被调用了!(即PyArrow包装的C++库)

eval有相关操作的实现

parser.cpp

非常经典的Parser,使用Enum量构建AST树

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/// Translates the parsed SQL statement into MLIR operations created through `builder`.
///
/// Only a parse result consisting of exactly one statement is handled; anything
/// else yields an empty optional without creating operations. The statement's
/// node type selects the translation routine.
///
/// @param builder  Builder used to create the operations.
/// @return The value of the created relalg::MaterializeOp for a SELECT statement;
///         an empty optional for every other handled statement.
/// @throws std::runtime_error if the statement type is not supported.
std::optional<mlir::Value> frontend::sql::Parser::translate(mlir::OpBuilder& builder) {
   if (!result.tree || result.tree->length != 1) {
      return {};
   }
   auto* statement = static_cast<Node*>(result.tree->head->data.ptr_value);
   switch (statement->type) {
      case T_VariableSetStmt:
         translateVariableSetStatement(builder, reinterpret_cast<VariableSetStmt*>(statement));
         break;
      case T_CreateStmt:
         translateCreateStatement(builder, reinterpret_cast<CreateStmt*>(statement));
         break;
      case T_CopyStmt:
         translateCopyStatement(builder, reinterpret_cast<CopyStmt*>(statement));
         break;
      case T_InsertStmt:
         translateInsertStmt(builder, reinterpret_cast<InsertStmt*>(statement));
         break;
      case T_SelectStmt: {
         parallelismAllowed = true;
         TranslationContext context;
         auto scope = context.createResolverScope();
         auto [tree, targetInfo] = translateSelectStmt(builder, reinterpret_cast<SelectStmt*>(statement), context, scope);

         // Arguments collected for MaterializeOp:
         // ::mlir::Type result, ::mlir::Value rel, ::mlir::ArrayAttr attrs, ::mlir::ArrayAttr columns
         std::vector<mlir::Attribute> attrs;
         std::vector<mlir::Attribute> names;
         std::vector<mlir::Attribute> colMemberNames;
         std::vector<mlir::Attribute> colTypes;
         auto& memberManager = builder.getContext()->getLoadedDialect<mlir::subop::SubOperatorDialect>()->getMemberManager();
         // Bind by const reference: each entry carries a std::string name that was
         // previously copied on every iteration.
         for (const auto& namedResult : targetInfo.namedResults) {
            // This column is skipped and never becomes part of the result table.
            if (namedResult.first == "primaryKeyHashValue") continue;
            names.push_back(builder.getStringAttr(namedResult.first));
            auto colMemberName = memberManager.getUniqueMember(namedResult.first.empty() ? "unnamed" : namedResult.first);
            auto columnType = namedResult.second->type;
            attrs.push_back(attrManager.createRef(namedResult.second));
            colTypes.push_back(mlir::TypeAttr::get(columnType));
            colMemberNames.push_back(builder.getStringAttr(colMemberName));
         }
         auto* mlirContext = builder.getContext();
         auto stateMembers = mlir::subop::StateMembersAttr::get(mlirContext, builder.getArrayAttr(colMemberNames), builder.getArrayAttr(colTypes));
         auto resultTableType = mlir::subop::ResultTableType::get(mlirContext, stateMembers);
         return builder.create<mlir::relalg::MaterializeOp>(builder.getUnknownLoc(), resultTableType, tree, builder.getArrayAttr(attrs), builder.getArrayAttr(names));
      }
      default:
         throw std::runtime_error("unsupported statement type");
   }
   return {};
}

Docker镜像运行

编译环境是不可能折腾的!

有Docker的话当然用Docker🤣

1
2
# Fetch the published LingoDB image.
docker pull ghcr.io/lingo-db/lingo-db:latest
# Start an interactive container named "lingo" with a bash shell.
docker run -it --name lingo ghcr.io/lingo-db/lingo-db /bin/bash

编译好的文件默认在build/lingodb-release

貌似跑了TPC-DS?

有一个Lingodbllvm在Python环境里面,所以LingoProject下面是空的

修改下官网给的测试(Data部分需要从外部拷入)

c51ad1708246c6cfd0d992b74331db9

修改后的shell:

1
2
3
4
5
6
7
# Directory with the sample "uni" database used by the stages that need a catalog.
DATA_DIR=/repo/resources/data/uni/

# Write the test query to a file.
echo "select * from studenten where name='Carnap'" > test.sql

# SQL -> MLIR (canonical form).
./sql-to-mlir test.sql "$DATA_DIR" > canonical.mlir
# Run the relalg query optimization pass against the database.
./mlir-db-opt --use-db "$DATA_DIR" --relalg-query-opt canonical.mlir > optimized.mlir
# Lower step by step: relalg -> subop -> db -> dsa.
./mlir-db-opt --lower-relalg-to-subop optimized.mlir > subop.mlir
./mlir-db-opt --lower-subop subop.mlir > hl-imperative.mlir
./mlir-db-opt --lower-db hl-imperative.mlir > ml-imperative.mlir
./mlir-db-opt --lower-dsa ml-imperative.mlir > ll-imperative.mlir

制作Debug版本

内存建议大于8GB,编译中出现的任何错误一律按内存不够大处理(问就是试过了)😅

你也可以使用我打包好的Docker Image:

# Pull the prebuilt debug image.
docker pull ccr.ccs.tencentyun.com/mocusz/lingo-debug

编译完后,安装GDB

1
# Install GDB (if the package cannot be found, run `apt update` first).
apt install gdb

配置VScode的launch.json文件(供参考,有需要的自己添加,主要是args和cwd)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "(gdb) Launch sql-to-mlir",
            "type": "cppdbg",
            "request": "launch",
            "program": "${workspaceFolder}/build/lingodb-debug/sql-to-mlir",
            // Arguments: the SQL file to translate and the database directory.
            "args": ["./build/lingodb-debug/test.sql", "./resources/data/uni/"],
            "stopAtEntry": false,
            // Relative paths in "args" are resolved against this directory.
            "cwd": "${workspaceFolder}",
            "environment": [],
            "externalConsole": false,
            "MIMode": "gdb",
            "setupCommands": [
                {
                    "description": "Enable pretty-printing for gdb",
                    "text": "-enable-pretty-printing",
                    "ignoreFailures": true
                },
                {
                    "description": "Set Disassembly Flavor to Intel",
                    "text": "-gdb-set disassembly-flavor intel",
                    "ignoreFailures": true
                }
            ]
        }
    ]
}

效果如图:

image-20240806162949102