源码简要分析

分析于2024.8.2

国内git recursive拖不下来,只能手动下载下来自己组装 (使用Docker更省事😋教程在第二章)

vendored: 提供PG解析器,md5,xxhash相关组件

tools:杂项,包括了Docker构建的过程,Python Package构建,MLIR基础设施,便于调试的Source Map,mlir-tools

建议先从mlir-tools看起,SQL->MLIR有意思,这里先贴一段,有空闲时问下大模型

解释下面C++代码

#include "frontend/SQL/Parser.h"
#include "mlir/Dialect/SubOperator/SubOperatorDialect.h"
#include "mlir/Dialect/SubOperator/SubOperatorOps.h"
#include "runtime/Session.h"

#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
void printMLIR(std::string sql, std::shared_ptr<runtime::Catalog> catalog) {
   mlir::MLIRContext context;
   mlir::DialectRegistry registry;
   registry.insert<mlir::BuiltinDialect>();
   registry.insert<mlir::relalg::RelAlgDialect>();
   registry.insert<mlir::subop::SubOperatorDialect>();
   registry.insert<mlir::tuples::TupleStreamDialect>();
   registry.insert<mlir::db::DBDialect>();
   registry.insert<mlir::func::FuncDialect>();
   registry.insert<mlir::arith::ArithDialect>();

   registry.insert<mlir::memref::MemRefDialect>();
   registry.insert<mlir::util::UtilDialect>();
   registry.insert<mlir::scf::SCFDialect>();
   registry.insert<mlir::LLVM::LLVMDialect>();
   context.appendDialectRegistry(registry);
   context.loadAllAvailableDialects();
   context.loadDialect<mlir::relalg::RelAlgDialect>();
   mlir::OpBuilder builder(&context);
   mlir::ModuleOp moduleOp = builder.create<mlir::ModuleOp>(builder.getUnknownLoc());
   frontend::sql::Parser translator(sql, *catalog, moduleOp);

   builder.setInsertionPointToStart(moduleOp.getBody());
   auto* queryBlock = new mlir::Block;
   {
      mlir::OpBuilder::InsertionGuard guard(builder);
      builder.setInsertionPointToStart(queryBlock);
      auto val = translator.translate(builder);
      if (val.has_value()) {
         builder.create<mlir::subop::SetResultOp>(builder.getUnknownLoc(), 0, val.value());
      }
      builder.create<mlir::func::ReturnOp>(builder.getUnknownLoc());
   }
   mlir::func::FuncOp funcOp = builder.create<mlir::func::FuncOp>(builder.getUnknownLoc(), "main", builder.getFunctionType({}, {}));
   funcOp.getBody().push_back(queryBlock);

   mlir::OpPrintingFlags flags;
   flags.assumeVerified();
   moduleOp->print(llvm::outs(), flags);
}
/// Entry point of the sql-to-mlir tool.
///
/// Usage: sql-to-mlir <sql-file> [db-dir]
///
/// Reads the whole SQL file into memory, splits it into queries and passes each
/// query to printMLIR. A query ends at a line whose first ';' is also its last
/// character, or at the end of the input. When a second argument is given, it is
/// used as the database directory for the catalog; otherwise an empty catalog is used.
///
/// @return 0 on success, 1 if the SQL file argument is missing or the file cannot be opened.
int main(int argc, char** argv) {
   // argv[1] must not be touched unless it was actually supplied.
   if (argc < 2) {
      std::cerr << "usage: " << (argc > 0 ? argv[0] : "sql-to-mlir") << " <sql-file> [db-dir]" << std::endl;
      return 1;
   }
   std::string filename = std::string(argv[1]);
   auto catalog = runtime::Catalog::createEmpty();
   if (argc >= 3) {
      std::string dbDir = std::string(argv[2]);
      catalog = runtime::DBCatalog::create(catalog, dbDir, false);
   }
   std::ifstream istream{filename};
   if (!istream.is_open()) {
      std::cerr << "could not open SQL file: " << filename << std::endl;
      return 1;
   }
   std::stringstream buffer;
   // Note: inserting zero characters (empty file) sets failbit on `buffer`, not eofbit.
   buffer << istream.rdbuf();
   while (true) {
      std::stringstream query;
      std::string line;
      std::getline(buffer, line);
      while (true) {
         if (!buffer.good()) {
            // Keep the last line when the input ends without a trailing newline.
            if (buffer.eof()) {
               query << line << '\n';
            }
            break;
         }
         query << line << '\n';
         // A line whose first ';' is its last character terminates the current query.
         if (!line.empty() && line.find(';') == line.size() - 1) {
            break;
         }
         std::getline(buffer, line);
      }
      printMLIR(query.str(), catalog);
      // Leave the loop on any non-good state, not only on eof: with an empty
      // input only failbit is set, and testing eof() alone would loop forever.
      if (!buffer.good()) {
         break;
      }
   }
   return 0;
}

test: 对于Dialect实现的lit(LLVM Integrated Tester)的mlir与sqlite的数据输入

resources:SQL数据与Apache Arrow的相关数据

llvm-project: 关于LLVM的Submodule

**TUM居然自己修改了一版LLVM!!!**确实是研究型数据库🥵(看commit时间应该介于LLVM17-LLVM18之间)

image-20240802181800740

lib: 项目实现的静态文件

Conversion实现论文的层级流转

include:项目的库文件

utility下面是Tracer的相关实现文件

frontend下面有SQL解析的头文件

execution涉及SQL运行细节(使用Arrow Compute实现,并用Intel的OneAPI进行加速?)

runtime有很多细节实现

runtime/Catalog.h涉及文件读取

runtime/Relation.h涉及文件关系,Arrow读取则依赖PyArrow的库

让我惊讶的是:.venv/lib/python3.10/site-packages下面的库也被调用了!(即PyArrow包装的C++库)

eval有相关操作的实现

parser.cpp

非常经典的Parser,使用Enum量构建AST树

/// Translates the parsed statement into MLIR operations using `builder`.
///
/// Only a parse result that holds exactly one statement is handled; the statement's
/// node type selects the matching translate* routine.
///
/// @param builder Builder used to create the operations.
/// @return The MaterializeOp result for a SELECT statement; an empty optional for
///         every other handled statement and when the parse result does not hold
///         exactly one statement.
/// @throws std::runtime_error if the statement type is not supported.
std::optional<mlir::Value> frontend::sql::Parser::translate(mlir::OpBuilder& builder) {
   // Nothing to translate unless there is exactly one parsed statement.
   if (!result.tree || result.tree->length != 1) {
      return {};
   }

   auto* stmt = static_cast<Node*>(result.tree->head->data.ptr_value);
   switch (stmt->type) {
      case T_VariableSetStmt:
         translateVariableSetStatement(builder, reinterpret_cast<VariableSetStmt*>(stmt));
         break;
      case T_CreateStmt:
         translateCreateStatement(builder, reinterpret_cast<CreateStmt*>(stmt));
         break;
      case T_CopyStmt:
         translateCopyStatement(builder, reinterpret_cast<CopyStmt*>(stmt));
         break;
      case T_InsertStmt:
         translateInsertStmt(builder, reinterpret_cast<InsertStmt*>(stmt));
         break;
      case T_SelectStmt: {
         parallelismAllowed = true;
         TranslationContext translationContext;
         auto resolverScope = translationContext.createResolverScope();
         auto [relation, targets] = translateSelectStmt(builder, reinterpret_cast<SelectStmt*>(stmt), translationContext, resolverScope);

         auto* mlirCtx = builder.getContext();
         auto& members = mlirCtx->getLoadedDialect<mlir::subop::SubOperatorDialect>()->getMemberManager();

         // Per output column: its reference, its visible name, a unique member name and its type.
         std::vector<mlir::Attribute> columnRefs;
         std::vector<mlir::Attribute> columnNames;
         std::vector<mlir::Attribute> memberNames;
         std::vector<mlir::Attribute> memberTypes;
         for (auto namedResult : targets.namedResults) {
            // Entries named "primaryKeyHashValue" are left out of the materialized result.
            if (namedResult.first == "primaryKeyHashValue") {
               continue;
            }
            columnNames.push_back(builder.getStringAttr(namedResult.first));
            auto uniqueMember = members.getUniqueMember(namedResult.first.empty() ? "unnamed" : namedResult.first);
            columnRefs.push_back(attrManager.createRef(namedResult.second));
            memberTypes.push_back(mlir::TypeAttr::get(namedResult.second->type));
            memberNames.push_back(builder.getStringAttr(uniqueMember));
         }

         auto stateMembers = mlir::subop::StateMembersAttr::get(mlirCtx, builder.getArrayAttr(memberNames), builder.getArrayAttr(memberTypes));
         auto tableType = mlir::subop::ResultTableType::get(mlirCtx, stateMembers);
         return builder.create<mlir::relalg::MaterializeOp>(builder.getUnknownLoc(), tableType, relation, builder.getArrayAttr(columnRefs), builder.getArrayAttr(columnNames));
      }
      default:
         throw std::runtime_error("unsupported statement type");
   }
   return {};
}

Docker镜像运行

编译环境是不可能折腾的!

有Docker的话当然用Docker🤣

docker pull ghcr.io/lingo-db/lingo-db:latest
docker run -it --name lingo  ghcr.io/lingo-db/lingo-db /bin/bash

编译好的文件默认在build/lingodb-release

貌似跑了TPC-DS?

有一个Lingodbllvm在Python环境里面,所以LingoProject下面是空的

修改下官网给的测试(Data部分需要从外部拷入)

c51ad1708246c6cfd0d992b74331db9修改后的shell:

# Write a one-line test query into test.sql.
echo "select * from studenten where name='Carnap'" > test.sql
# SQL -> MLIR: first argument is the SQL file, second the database directory; the module is printed to stdout.
./sql-to-mlir test.sql /repo/resources/data/uni/ > canonical.mlir
# Run the --relalg-query-opt pass, with --use-db pointing at the same data directory.
./mlir-db-opt --use-db /repo/resources/data/uni/ --relalg-query-opt canonical.mlir > optimized.mlir
# Run --lower-relalg-to-subop on the optimized module.
./mlir-db-opt --lower-relalg-to-subop optimized.mlir > subop.mlir
# Run --lower-subop; each step consumes the file written by the previous one.
./mlir-db-opt --lower-subop subop.mlir > hl-imperative.mlir
# Run --lower-db.
./mlir-db-opt --lower-db hl-imperative.mlir > ml-imperative.mlir
# Run --lower-dsa, producing the final file of this sequence.
./mlir-db-opt --lower-dsa ml-imperative.mlir > ll-imperative.mlir

制作Debug版本

内存建议大于8GB,编译中出现的任何错误一律按内存不够大处理(问就是试过了)😅

你也可以使用我打包好的Docker Image:

docker pull ccr.ccs.tencentyun.com/mocusz/lingo-debug

编译完后,安装GDB

apt install gdb

配置VScode的launch.json文件(供参考,有需要的自己添加,主要是args和cwd)

{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "(gdb) Launch sql-to-mlir",
            "type": "cppdbg",
            "request": "launch",
            "program": "${workspaceFolder}/build/lingodb-debug/sql-to-mlir",
            "args": ["./build/lingodb-debug/test.sql","./resources/data/uni/"],
            "stopAtEntry": false,
            "cwd": "${workspaceFolder}",
            "environment": [],
            "externalConsole": false,
            "MIMode": "gdb",
            "setupCommands": [
                {
                    "description": "Enable pretty-printing for gdb",
                    "text": "-enable-pretty-printing",
                    "ignoreFailures": true
                },
                {
                    "description": "Set Disassembly Flavor to Intel",
                    "text": "-gdb-set disassembly-flavor intel",
                    "ignoreFailures": true
                }
            ]
        },
    ]
}

效果如图:

image-20240806162949102