https://github.com/duckdb/duckdb

https://www.duckdb.org/docs/current/sql/statements/overview

数据清洗

-- 导入数据
CREATE TABLE tbl1 AS
FROM
  '/Users/leo/Downloads/123.csv';

CREATE OR REPLACE TABLE tbl1 AS
FROM
  read_csv(
    '/Users/leo/Downloads/*.csv',
    header = true,
    filename = true,
    union_by_name = true
  );

-- 追加数据
INSERT INTO tbl1 BY NAME
FROM read_csv('/Users/leo/Downloads/123.csv');

-- 查找没有任何数据的列
SELECT
  list(column_name)
FROM
  (SUMMARIZE tbl1)
WHERE
  null_percentage = 100;

-- 清洗数据(创建新表)
CREATE OR REPLACE TABLE tbl2 AS
SELECT DISTINCT ON ("Issue key") -- 排除重复行
  * EXCLUDE ('Due Date', 'Environment', ...) -- 排除空数据列
FROM
  tbl1;

-- 导出数据
COPY tbl2 TO '/Users/leo/Downloads/output.csv' (HEADER, DELIMITER ',');

COPY (
  SELECT
    "Issue key",
    "Priority",
    "Summary",
    "Description"
  FROM
    tbl2
) TO '/Users/leo/Downloads/output.csv' (HEADER, DELIMITER ',');

-- 分区导出(按行拆分)
COPY (
  SELECT
    "Issue key",
    "Priority",
    "Summary",
    "Description"
  FROM
    tbl2
) TO '/Users/leo/Downloads/output/' (
  FORMAT csv,
  PARTITION_BY ("Issue key"),
  WRITE_PARTITION_COLUMNS,
  OVERWRITE_OR_IGNORE
);