4. 使用sql查詢excel內容

1. 簡介

我們在前面的文章中提到了calcite支持csv和json文件的數據源適配, 其實就是將文件解析成表然后以文件夾為schema, 然后將生成的schema注冊到RootSehema(RootSchema是所有數據源schema的parent，多個不同數據源schema可以掛在同一個RootSchema下)下, 最終使用calcite的特性進行sql的解析查詢返回.

但其實我們的數據文件一般使用excel進行存儲,流轉, 但很可惜, calcite本身沒有excel的適配器, 但其實我們可以模仿calcite-file, 自己搞一個calcite-file-excel, 也可以熟悉calcite的工作原理.

2. 實現思路

因為excel有sheet的概念, 所以可以將一個excel解析成schema, 每個sheet解析成table, 實現步驟如下:

實現SchemaFactory重寫create方法: schema工廠用于創建schema
繼承AbstractSchema: schema描述類用于解析excel, 創建table(解析sheet)
繼承AbstractTable, ScannableTable: table描述類提供字段信息和數據內容等(解析sheet data)

3. Excel樣例

excel有兩個sheet頁, 分別是user_info 和 role_info如下:

ok, 萬事具備.

4. Maven

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>5.2.3</version>
</dependency>

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>5.2.3</version>
</dependency>

<dependency>
    <groupId>org.apache.calcite</groupId>
    <artifactId>calcite-core</artifactId>
    <version>1.37.0</version>
</dependency>

5. 核心代碼

5.1 SchemaFactory

package com.ldx.calcite.excel;

import com.google.common.collect.Lists;
import org.apache.calcite.schema.Schema;
import org.apache.calcite.schema.SchemaFactory;
import org.apache.calcite.schema.SchemaPlus;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.commons.lang3.StringUtils;

import java.io.File;
import java.util.List;
import java.util.Map;

/**
 * schema factory
 */
public class ExcelSchemaFactory implements SchemaFactory {
    public final static ExcelSchemaFactory INSTANCE = new ExcelSchemaFactory();

    private ExcelSchemaFactory(){}

    @Override
    public Schema create(SchemaPlus parentSchema, String name, Map<String, Object> operand) {
        final Object filePath = operand.get("filePath");

        if (ObjectUtils.isEmpty(filePath)) {
            throw new NullPointerException("can not find excel file");
        }

        return this.create(filePath.toString());
    }

    public Schema create(String excelFilePath) {
        if (StringUtils.isBlank(excelFilePath)) {
            throw new NullPointerException("can not find excel file");
        }

        return this.create(new File(excelFilePath));
    }

    public Schema create(File excelFile) {
        if (ObjectUtils.isEmpty(excelFile) || !excelFile.exists()) {
            throw new NullPointerException("can not find excel file");
        }

        if (!excelFile.isFile() || !isExcelFile(excelFile)) {
            throw new RuntimeException("can not find excel file: " + excelFile.getAbsolutePath());
        }

        return new ExcelSchema(excelFile);
    }

    protected List<String> supportedFileSuffix() {
        return Lists.newArrayList("xls", "xlsx");
    }

    private boolean isExcelFile(File excelFile) {
        if (ObjectUtils.isEmpty(excelFile)) {
            return false;
        }

        final String name = excelFile.getName();
        return StringUtils.endsWithAny(name, this.supportedFileSuffix().toArray(new String[0]));
    }
}

schema中有多個重載的create方法用于方便的創建schema, 最終將excel file 交給ExcelSchema創建一個schema對象

5.2 Schema

package com.ldx.calcite.excel;

import org.apache.calcite.schema.Table;
import org.apache.calcite.schema.impl.AbstractSchema;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.testng.collections.Maps;

import java.io.File;
import java.util.Iterator;
import java.util.Map;

/**
 * schema
 */
public class ExcelSchema extends AbstractSchema {
    private final File excelFile;

    private Map<String, Table> tableMap;

    public ExcelSchema(File excelFile) {
        this.excelFile = excelFile;
    }

    @Override
    protected Map<String, Table> getTableMap() {
        if (ObjectUtils.isEmpty(tableMap)) {
            tableMap = createTableMap();
        }

        return tableMap;
    }

    private Map<String, Table> createTableMap() {
        final Map<String, Table> result = Maps.newHashMap();

        try (Workbook workbook = WorkbookFactory.create(excelFile)) {
            final Iterator<Sheet> sheetIterator = workbook.sheetIterator();

            while (sheetIterator.hasNext()) {
                final Sheet sheet = sheetIterator.next();
                final ExcelScannableTable excelScannableTable = new ExcelScannableTable(sheet, null);
                result.put(sheet.getSheetName(), excelScannableTable);
            }
        }
        catch (Exception ignored) {}

        return result;
    }
}

schema類讀取Excel file, 并循環讀取sheet, 將每個sheet解析成ExcelScannableTable并存儲

5.3 Table

package com.ldx.calcite.excel;

import com.google.common.collect.Lists;
import com.ldx.calcite.excel.enums.JavaFileTypeEnum;
import org.apache.calcite.DataContext;
import org.apache.calcite.adapter.java.JavaTypeFactory;
import org.apache.calcite.linq4j.Enumerable;
import org.apache.calcite.linq4j.Linq4j;
import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.rel.type.RelDataTypeFactory;
import org.apache.calcite.rel.type.RelProtoDataType;
import org.apache.calcite.schema.ScannableTable;
import org.apache.calcite.schema.impl.AbstractTable;
import org.apache.calcite.sql.type.SqlTypeName;
import org.apache.calcite.util.Pair;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.checkerframework.checker.nullness.qual.Nullable;

import java.util.List;

/**
 * table
 */
public class ExcelScannableTable extends AbstractTable implements ScannableTable {
    private final RelProtoDataType protoRowType;

    private final Sheet sheet;

    private RelDataType rowType;

    private List<JavaFileTypeEnum> fieldTypes;

    private List<Object[]> rowDataList;

    public ExcelScannableTable(Sheet sheet, RelProtoDataType protoRowType) {
        this.protoRowType = protoRowType;
        this.sheet = sheet;
    }

    @Override
    public Enumerable<@Nullable Object[]> scan(DataContext root) {
        JavaTypeFactory typeFactory = root.getTypeFactory();
        final List<JavaFileTypeEnum> fieldTypes = this.getFieldTypes(typeFactory);

        if (rowDataList == null) {
            rowDataList = readExcelData(sheet, fieldTypes);
        }

        return Linq4j.asEnumerable(rowDataList);
    }

    @Override
    public RelDataType getRowType(RelDataTypeFactory typeFactory) {
        if (ObjectUtils.isNotEmpty(protoRowType)) {
            return protoRowType.apply(typeFactory);
        }

        if (ObjectUtils.isEmpty(rowType)) {
            rowType = deduceRowType((JavaTypeFactory) typeFactory, sheet, null);
        }

        return rowType;
    }

    public List<JavaFileTypeEnum> getFieldTypes(RelDataTypeFactory typeFactory) {
        if (fieldTypes == null) {
            fieldTypes = Lists.newArrayList();
            deduceRowType((JavaTypeFactory) typeFactory, sheet, fieldTypes);
        }
        return fieldTypes;
    }

    private List<Object[]> readExcelData(Sheet sheet, List<JavaFileTypeEnum> fieldTypes) {
        List<Object[]> rowDataList = Lists.newArrayList();

        for (int rowIndex = 1; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
            Row row = sheet.getRow(rowIndex);
            Object[] rowData = new Object[fieldTypes.size()];

            for (int i = 0; i < row.getLastCellNum(); i++) {
                final JavaFileTypeEnum javaFileTypeEnum = fieldTypes.get(i);
                Cell cell = row.getCell(i, Row.MissingCellPolicy.CREATE_NULL_AS_BLANK);
                final Object cellValue = javaFileTypeEnum.getCellValue(cell);
                rowData[i] = cellValue;
            }

            rowDataList.add(rowData);
        }

        return rowDataList;
    }

    public static RelDataType deduceRowType(JavaTypeFactory typeFactory, Sheet sheet, List<JavaFileTypeEnum> fieldTypes) {
        final List<String> names = Lists.newArrayList();
        final List<RelDataType> types = Lists.newArrayList();

        if (sheet != null) {
            Row headerRow = sheet.getRow(0);

            if (headerRow != null) {
                for (int i = 0; i < headerRow.getLastCellNum(); i++) {
                    Cell cell = headerRow.getCell(i, Row.MissingCellPolicy.CREATE_NULL_AS_BLANK);
                    String[] columnInfo = cell
                            .getStringCellValue()
                            .split(":");
                    String columnName = columnInfo[0].trim();
                    String columnType = null;

                    if (columnInfo.length == 2) {
                        columnType = columnInfo[1].trim();
                    }

                    final JavaFileTypeEnum javaFileType = JavaFileTypeEnum
                            .of(columnType)
                            .orElse(JavaFileTypeEnum.UNKNOWN);
                    final RelDataType sqlType = typeFactory.createSqlType(javaFileType.getSqlTypeName());
                    names.add(columnName);
                    types.add(sqlType);

                    if (fieldTypes != null) {
                        fieldTypes.add(javaFileType);
                    }
                }
            }
        }

        if (names.isEmpty()) {
            names.add("line");
            types.add(typeFactory.createSqlType(SqlTypeName.VARCHAR));
        }

        return typeFactory.createStructType(Pair.zip(names, types));
    }
}

table類中其中有兩個比較關鍵的方法

scan: 掃描表內容, 我們這里將sheet頁面的數據內容解析存儲最后交給calcite
getRowType: 獲取字段信息, 我們這里默認使用第一條記錄作為表頭(row[0]) 并解析為字段信息, 字段規則跟csv一樣 name:string, 冒號前面的是字段key, 冒號后面的是字段類型, 如果未指定字段類型, 則解析為UNKNOWN, 后續JavaFileTypeEnum會進行類型推斷, 最終在結果處理時calcite也會進行推斷
deduceRowType: 推斷字段類型, 方法中使用JavaFileTypeEnum枚舉類對java type & sql type & 字段值轉化處理方法進行管理

5.4 ColumnTypeEnum

package com.ldx.calcite.excel.enums;

import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
import org.apache.calcite.avatica.util.DateTimeUtils;
import org.apache.calcite.sql.type.SqlTypeName;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.FastDateFormat;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.util.CellUtil;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.Optional;
import java.util.TimeZone;
import java.util.function.Function;

/**
 * type converter
 */
@Slf4j
@Getter
public enum JavaFileTypeEnum {
    STRING("string", SqlTypeName.VARCHAR, Cell::getStringCellValue),
    BOOLEAN("boolean", SqlTypeName.BOOLEAN, Cell::getBooleanCellValue),
    BYTE("byte", SqlTypeName.TINYINT, Cell::getStringCellValue),
    CHAR("char", SqlTypeName.CHAR, Cell::getStringCellValue),
    SHORT("short", SqlTypeName.SMALLINT, Cell::getNumericCellValue),
    INT("int", SqlTypeName.INTEGER, cell -> (Double.valueOf(cell.getNumericCellValue()).intValue())),
    LONG("long", SqlTypeName.BIGINT, cell -> (Double.valueOf(cell.getNumericCellValue()).longValue())),
    FLOAT("float", SqlTypeName.REAL, Cell::getNumericCellValue),
    DOUBLE("double", SqlTypeName.DOUBLE, Cell::getNumericCellValue),
    DATE("date", SqlTypeName.DATE, getValueWithDate()),
    TIMESTAMP("timestamp", SqlTypeName.TIMESTAMP, getValueWithTimestamp()),
    TIME("time", SqlTypeName.TIME, getValueWithTime()),
    UNKNOWN("unknown", SqlTypeName.UNKNOWN, getValueWithUnknown()),;
    // cell type
    private final String typeName;
	// sql type
    private final SqlTypeName sqlTypeName;
    // value convert func
    private final Function<Cell, Object> cellValueFunc;

    private static final FastDateFormat TIME_FORMAT_DATE;

    private static final FastDateFormat TIME_FORMAT_TIME;

    private static final FastDateFormat TIME_FORMAT_TIMESTAMP;

    static {
        final TimeZone gmt = TimeZone.getTimeZone("GMT");
        TIME_FORMAT_DATE = FastDateFormat.getInstance("yyyy-MM-dd", gmt);
        TIME_FORMAT_TIME = FastDateFormat.getInstance("HH:mm:ss", gmt);
        TIME_FORMAT_TIMESTAMP = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss", gmt);
    }

    JavaFileTypeEnum(String typeName, SqlTypeName sqlTypeName, Function<Cell, Object> cellValueFunc) {
        this.typeName = typeName;
        this.sqlTypeName = sqlTypeName;
        this.cellValueFunc = cellValueFunc;
    }

    public static Optional<JavaFileTypeEnum> of(String typeName) {
        return Arrays
                .stream(values())
                .filter(type -> StringUtils.equalsIgnoreCase(typeName, type.getTypeName()))
                .findFirst();
    }

    public static SqlTypeName findSqlTypeName(String typeName) {
        final Optional<JavaFileTypeEnum> javaFileTypeOptional = of(typeName);

        if (javaFileTypeOptional.isPresent()) {
            return javaFileTypeOptional
                    .get()
                    .getSqlTypeName();
        }

        return SqlTypeName.UNKNOWN;
    }

    public Object getCellValue(Cell cell) {
        return cellValueFunc.apply(cell);
    }

    public static Function<Cell, Object> getValueWithUnknown() {
        return cell -> {
            if (ObjectUtils.isEmpty(cell)) {
                return null;
            }

            switch (cell.getCellType()) {
                case STRING:
                    return cell.getStringCellValue();
                case NUMERIC:
                    if (DateUtil.isCellDateFormatted(cell)) {
                        // 如果是日期類型，返回日期對象
                        return cell.getDateCellValue();
                    }
                    else {
                        // 否則返回數值
                        return cell.getNumericCellValue();
                    }
                case BOOLEAN:
                    return cell.getBooleanCellValue();
                case FORMULA:
                    // 對于公式單元格，先計算公式結果，再獲取其值
                    try {
                        return cell.getNumericCellValue();
                    }
                    catch (Exception e) {
                        try {
                            return cell.getStringCellValue();
                        }
                        catch (Exception ex) {
                            log.error("parse unknown data error, cellRowIndex:{}, cellColumnIndex:{}", cell.getRowIndex(), cell.getColumnIndex(), e);
                            return null;
                        }
                    }
                case BLANK:
                    return "";
                default:
                    return null;
            }
        };
    }

    public static Function<Cell, Object> getValueWithDate() {
        return cell -> {
            Date date = cell.getDateCellValue();

            if(ObjectUtils.isEmpty(date)) {
                return null;
            }

            try {
                final String formated = new SimpleDateFormat("yyyy-MM-dd").format(date);
                Date newDate = TIME_FORMAT_DATE.parse(formated);
                return (int) (newDate.getTime() / DateTimeUtils.MILLIS_PER_DAY);
            }
            catch (ParseException e) {
                log.error("parse date error, date:{}", date, e);
            }

            return null;
        };
    }

    public static Function<Cell, Object> getValueWithTimestamp() {
        return cell -> {
            Date date = cell.getDateCellValue();

            if(ObjectUtils.isEmpty(date)) {
                return null;
            }

            try {
                final String formated = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(date);
                Date newDate = TIME_FORMAT_TIMESTAMP.parse(formated);
                return (int) newDate.getTime();
            }
            catch (ParseException e) {
                log.error("parse timestamp error, date:{}", date, e);
            }

            return null;
        };
    }

    public static Function<Cell, Object> getValueWithTime() {
        return cell -> {
            Date date = cell.getDateCellValue();

            if(ObjectUtils.isEmpty(date)) {
                return null;
            }

            try {
                final String formated = new SimpleDateFormat("HH:mm:ss").format(date);
                Date newDate = TIME_FORMAT_TIME.parse(formated);
                return newDate.getTime();
            }
            catch (ParseException e) {
                log.error("parse time error, date:{}", date, e);
            }

            return null;
        };
    }
}

該枚舉類主要管理了java type& sql type & cell value convert func, 方便統一管理類型映射及單元格內容提取時的轉換方法(這里借用了java8 function函數特性)

注: 這里的日期轉換只能這樣寫, 即使用GMT的時區(抄的calcite-file), 要不然輸出的日期時間一直有時差...

6. 測試查詢

package com.ldx.calcite;

import com.ldx.calcite.excel.ExcelSchemaFactory;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.calcite.config.CalciteConnectionProperty;
import org.apache.calcite.jdbc.CalciteConnection;
import org.apache.calcite.schema.Schema;
import org.apache.calcite.schema.SchemaPlus;
import org.apache.calcite.util.Sources;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.testng.collections.Maps;

import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Map;
import java.util.Properties;

@Slf4j
public class CalciteExcelTest {
    private static Connection connection;

    private static SchemaPlus rootSchema;

    private static CalciteConnection calciteConnection;

    @BeforeAll
    @SneakyThrows
    public static void beforeAll() {
        Properties info = new Properties();
        // 不區分sql大小寫
        info.setProperty(CalciteConnectionProperty.CASE_SENSITIVE.camelName(), "false");
        // 創建Calcite連接
        connection = DriverManager.getConnection("jdbc:calcite:", info);
        calciteConnection = connection.unwrap(CalciteConnection.class);
        // 構建RootSchema，在Calcite中，RootSchema是所有數據源schema的parent，多個不同數據源schema可以掛在同一個RootSchema下
        rootSchema = calciteConnection.getRootSchema();
    }

    @Test
    @SneakyThrows
    public void test_execute_query() {
        final Schema schema = ExcelSchemaFactory.INSTANCE.create(resourcePath("file/test.xlsx"));
        rootSchema.add("test", schema);
        // 設置默認的schema
        calciteConnection.setSchema("test");
        final Statement statement = calciteConnection.createStatement();
        ResultSet resultSet = statement.executeQuery("SELECT * FROM user_info");
        printResultSet(resultSet);
        System.out.println("=========");
        ResultSet resultSet2 = statement.executeQuery("SELECT * FROM test.user_info where id > 110 and birthday > '2003-01-01'");
        printResultSet(resultSet2);
        System.out.println("=========");
        ResultSet resultSet3 = statement.executeQuery("SELECT * FROM test.user_info ui inner join  test.role_info ri on ui.role_id = ri.id");
        printResultSet(resultSet3);
    }

    @AfterAll
    @SneakyThrows
    public static void closeResource() {
        connection.close();
    }

    private static String resourcePath(String path) {
        final URL url = CalciteExcelTest.class.getResource("/" + path);
        return Sources.of(url).file().getAbsolutePath();
    }

    public static void printResultSet(ResultSet resultSet) throws SQLException {
        // 獲取 ResultSet 元數據
        ResultSetMetaData metaData = resultSet.getMetaData();

        // 獲取列數
        int columnCount = metaData.getColumnCount();
        log.info("Number of columns: {}",columnCount);

        // 遍歷 ResultSet 并打印結果
        while (resultSet.next()) {
            final Map<String, String> item = Maps.newHashMap();
            // 遍歷每一列并打印
            for (int i = 1; i <= columnCount; i++) {
                String columnName = metaData.getColumnName(i);
                String columnValue = resultSet.getString(i);
                item.put(columnName, columnValue);
            }

            log.info(item.toString());
        }
    }
}

測試結果如下:

posted @ 2025-01-22 08:24 張鐵牛閱讀(1139) 評論(20) 收藏舉報

刷新頁面返回頂部

張鐵牛