Re: [PR] Features add csv to tsfile [tsfile]

via GitHub Thu, 04 Jul 2024 01:34:01 -0700


CloudWise-Lukemiao commented on code in PR #139:
URL: https://github.com/apache/tsfile/pull/139#discussion_r1665341825



##########
java/tools/src/main/java/org/apache/tsfile/tools/SchemaParser.java:
##########
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.tsfile.tools;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+public class SchemaParser {
+
+  /**
+   * Holds the settings read from a schema definition file.
+   *
+   * <p>Optional settings start at the defaults listed in the tools README ({@code ms} for the time
+   * precision, {@code ,} for the separator, {@code true} for the header flag), so a schema file
+   * that leaves them out still produces usable, non-null values.
+   */
+  public static class Schema {
+    /** Value of the {@code table_name=} entry. */
+    String tableName;
+
+    /** Value of the {@code time_precision=} entry; {@code ms} unless the file overrides it. */
+    String timePrecision = "ms";
+
+    /** Value of the {@code has_header=} entry; {@code true} unless the file overrides it. */
+    boolean hasHeader = true;
+
+    /** Value of the {@code separator=} entry; {@code ,} unless the file overrides it. */
+    String separator = ",";
+
+    /** Value of the {@code null_format=} entry, or {@code null} when the entry is absent. */
+    String nullFormat;
+
+    /** Value of the {@code time_column=} entry. */
+    String timeColumn;
+
+    /** Position of the time column among the csv_columns entries, or -1 while unresolved. */
+    int timeColumnIndex = -1;
+
+    /** Entries of the {@code id_columns} section, in file order. */
+    List<IDColumns> idColumns = new ArrayList<>();
+
+    /** Entries of the {@code csv_columns} section, in file order. */
+    List<Column> csvColumns = new ArrayList<>();
+
+    @Override
+    public String toString() {
+      return "Schema{"
+          + "tableName='"
+          + tableName
+          + '\''
+          + ", timePrecision='"
+          + timePrecision
+          + '\''
+          + ", hasHeader="
+          + hasHeader
+          + ", separator='"
+          + separator
+          + '\''
+          + ", nullFormat='"
+          + nullFormat
+          + '\''
+          + ", timeColumn='"
+          + timeColumn
+          + '\''
+          + ", idColumns="
+          + idColumns
+          + ", csvColumns="
+          + csvColumns
+          + '}';
+    }
+  }
+
+  public static class Column {
+    String name;
+    String type;
+
+    boolean isSkip;
+
+    public Column(String name, String type) {
+      this.name = name;
+      this.isSkip = false;
+      this.type = type;
+    }
+
+    public Column(String name) {
+      this.name = name;
+      this.isSkip = true;
+    }
+
+    @Override
+    public String toString() {
+      return "Column{"
+          + "name='"
+          + name
+          + '\''
+          + ", type='"
+          + type
+          + '\''
+          + ", isSkip="
+          + isSkip
+          + '}';
+    }
+  }
+
+  public static class IDColumns {
+    String name;
+    boolean isDefault;
+    String defaultValue;
+    int csvColumnIndex = -1;
+    boolean isExistCsvColumn;
+
+    public IDColumns(String name, boolean isDefault, String defaultValue) {
+      this.name = name;
+      this.isDefault = isDefault;
+      if (isDefault) {
+        this.defaultValue = defaultValue;
+        this.isExistCsvColumn = false;
+      }
+    }
+
+    public IDColumns(String name) {
+      this.name = name;
+      this.isDefault = false;
+      this.isExistCsvColumn = true;
+    }
+
+    @Override
+    public String toString() {
+      return "IDColumns{"
+          + "name='"
+          + name
+          + '\''
+          + ", isDefault="
+          + isDefault
+          + ", defaultValue='"
+          + defaultValue
+          + '\''
+          + ", isExistCsvColumn="
+          + isExistCsvColumn
+          + ", csvColumnIndex="
+          + csvColumnIndex
+          + '}';
+    }
+  }
+
+  /**
+   * Reads a schema definition file and builds the corresponding {@link Schema}.
+   *
+   * <p>Blank lines and lines starting with {@code //} are skipped. {@code key=value} lines set the
+   * scalar settings. The bare lines {@code id_columns} and {@code csv_columns} switch the section
+   * that subsequent lines are parsed into. After reading, ID columns are resolved against the CSV
+   * columns, the settings are validated, and the separator keyword {@code tab} is replaced by a
+   * tab character.
+   *
+   * @param filePath path of the schema definition file
+   * @return the populated schema
+   * @throws IOException if the file cannot be opened or read
+   */
+  public static Schema parseSchema(String filePath) throws IOException {
+    Schema result = new Schema();
+    try (BufferedReader in = new BufferedReader(new FileReader(filePath))) {
+      // Section currently being read; lines seen before any section header match neither value.
+      final int noSection = 0;
+      final int idSection = 1;
+      final int csvSection = 2;
+      int section = noSection;
+      // Counts the csv_columns lines handed to the parser so far.
+      int csvColumnPosition = 0;
+      for (String raw = in.readLine(); raw != null; raw = in.readLine()) {
+        String entry = raw.trim();
+        if (entry.isEmpty() || entry.startsWith("//")) {
+          continue;
+        }
+
+        if (entry.startsWith("table_name=")) {
+          result.tableName = extractValue(entry);
+        } else if (entry.startsWith("time_precision=")) {
+          result.timePrecision = extractValue(entry);
+        } else if (entry.startsWith("has_header=")) {
+          result.hasHeader = Boolean.parseBoolean(extractValue(entry));
+        } else if (entry.startsWith("separator=")) {
+          result.separator = extractValue(entry);
+        } else if (entry.startsWith("null_format=")) {
+          result.nullFormat = extractValue(entry);
+        } else if (entry.startsWith("time_column=")) {
+          result.timeColumn = extractValue(entry);
+        } else if (entry.equals("id_columns")) {
+          section = idSection;
+        } else if (entry.equals("csv_columns")) {
+          section = csvSection;
+        } else if (section == idSection) {
+          parseIdColumns(entry, result);
+        } else if (section == csvSection) {
+          parseCsvColumns(entry, result, csvColumnPosition);
+          csvColumnPosition++;
+        }
+      }
+      addIdColumnsIndex(result);
+    }
+    validateParams(result);
+    if (result.separator.equals("tab")) {
+      result.separator = "\t";
+    }
+    return result;
+  }
+
+  /**
+   * Returns the trimmed text that follows the first {@code '='} of a {@code key=value} line.
+   *
+   * <p>An empty value (for example {@code null_format=}) yields an empty string, and a value that
+   * itself contains {@code '='} is returned in full.
+   *
+   * @param line a schema line of the form {@code key=value}
+   * @return the value part with surrounding whitespace removed
+   * @throws IllegalArgumentException if the line contains no {@code '='}
+   */
+  private static String extractValue(String line) {
+    int separatorPos = line.indexOf('=');
+    if (separatorPos < 0) {
+      throw new IllegalArgumentException("Expected a key=value entry but found: " + line);
+    }
+    // Cut at the first '=' only, so the value may be empty or contain further '=' characters.
+    return line.substring(separatorPos + 1).trim();
+  }
+
+  /**
+   * Parses one line of the {@code id_columns} section and appends the result to the schema.
+   *
+   * <p>A single token names a column taken from the CSV. Three tokens are read as name, keyword
+   * and default value; the column is treated as a default-valued column when the keyword equals
+   * {@code DEFAULT} ignoring case. Tokens may be separated by any run of spaces or tabs.
+   *
+   * @param line one non-empty line of the id_columns section
+   * @param schema the schema whose {@code idColumns} list receives the new entry
+   * @throws IllegalArgumentException if the line has neither one nor three tokens
+   */
+  private static void parseIdColumns(String line, Schema schema) {
+    // Split on whitespace runs so tabs or repeated spaces do not produce empty or merged tokens.
+    String[] parts = line.trim().split("\\s+");
+    if (parts.length == 3) {
+      schema.idColumns.add(
+          new IDColumns(parts[0], parts[1].equalsIgnoreCase("DEFAULT"), parts[2]));
+    } else if (parts.length == 1) {
+      schema.idColumns.add(new IDColumns(parts[0]));
+    } else {
+      throw new IllegalArgumentException("The data format of id_columns is incorrect");
+    }
+  }
+
+  /**
+   * Resolves each non-default ID column to the position of the first CSV column with the same
+   * name. ID columns without a match keep their current {@code csvColumnIndex}.
+   *
+   * @param schema the schema whose {@code idColumns} are updated in place
+   */
+  private static void addIdColumnsIndex(Schema schema) {
+    List<Column> declared = schema.csvColumns;
+    for (IDColumns id : schema.idColumns) {
+      if (id.isDefault) {
+        continue;
+      }
+      // Advance to the first CSV column carrying this name, if there is one.
+      int position = 0;
+      while (position < declared.size() && !declared.get(position).name.equals(id.name)) {
+        position++;
+      }
+      if (position < declared.size()) {
+        id.csvColumnIndex = position;
+      }
+    }
+  }
+
+  /**
+   * Parses one line of the {@code csv_columns} section and appends the result to the schema.
+   *
+   * <p>Two tokens are read as column name and data type; a trailing {@code ,} or {@code ;} on the
+   * type is dropped, and if the name equals the schema's time column, {@code timeIndex} is
+   * recorded as the time column index. A single token declares a skipped column; a trailing
+   * {@code ,} or {@code ;} on the name is dropped. Tokens may be separated by any run of spaces
+   * or tabs.
+   *
+   * @param line one non-empty line of the csv_columns section
+   * @param schema the schema whose {@code csvColumns} list receives the new entry
+   * @param timeIndex position of this line among the csv_columns lines
+   * @throws IllegalArgumentException if the line has neither one nor two tokens
+   */
+  private static void parseCsvColumns(String line, Schema schema, int timeIndex) {
+    // Split on whitespace runs so tabs or repeated spaces do not produce empty or merged tokens.
+    String[] parts = line.trim().split("\\s+");
+    String columnName = parts[0];
+
+    if (parts.length == 2) {
+      String dataType = parts[1];
+      if (dataType.endsWith(",") || dataType.endsWith(";")) {
+        dataType = dataType.substring(0, dataType.length() - 1);
+      }
+      if (columnName.equals(schema.timeColumn)) {
+        schema.timeColumnIndex = timeIndex;
+      }
+      schema.csvColumns.add(new Column(columnName, dataType));
+    } else if (parts.length == 1) {
+      if (columnName.endsWith(",") || columnName.endsWith(";")) {
+        columnName = columnName.substring(0, columnName.length() - 1);
+      }
+      schema.csvColumns.add(new Column(columnName));
+    } else {
+      // Fail fast: silently dropping the line would leave csvColumns shorter than the number of
+      // lines counted by the caller, so recorded column positions would no longer line up.
+      throw new IllegalArgumentException("The data format of csv_columns is incorrect");
+    }
+  }
+
+  private static void validateParams(SchemaParser.Schema schema) {
+    if (!schema.timePrecision.equals("us")
+        && !schema.timePrecision.equals("ms")
+        && !schema.timePrecision.equals("ns")) {
+      throw new IllegalArgumentException("timePrecision must be us,ms or ns");
+    }
+    if (!schema.separator.equals(",")
+        && !schema.separator.equals("tab")
+        && !schema.separator.equals(";")) {
+      throw new IllegalArgumentException("separator must be , or tab or ;");

Review Comment:
   fixed



##########
java/tools/README-zh.md:
##########
@@ -0,0 +1,129 @@
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+-->
+
+[English](./README.md) | [中文](./README-zh.md)
+# TsFile Tools 手册
+## 简介
+
+## 开发
+
+### 前置条件
+
+构建 Java 版的 TsFile Tools，必须要安装以下依赖:
+
+1. Java >= 1.8 (1.8, 11 到 17 都经过验证. 请确保设置了环境变量).
+2. Maven >= 3.6 (如果要从源代码编译TsFile).
+
+
+### 使用 maven 构建
+
+```
+mvn clean package -P with-java -DskipTests
+```
+
+### 安装到本地机器
+
+```
+mvn install -P with-java -DskipTests
+```
+
+## schema 定义
+
+| 参数         | 说明                       | 是否必填 | 默认值  |
+|------------|--------------------------|------|------|
+| table_name | 表名                       | 是    |      |
+| time_precision | 时间精度（可选值有：ms/us/ns）      | 否    | ms   |
+| has_header | 是否包含表头 (可选值有：true/false) | 否    | true |
+| separator | 行内分隔符（可选值有：, /tab/ ;）    | 否    | ,    |
+| null_format | 空值                       | 否    |    |
+| id_columns | 主键列，支持 csv 中不存在的列作为层级      | 否    |      |
+| time_column | 时间列                      | 是    |      |
+| csv_columns | 按照顺序与csv列一一对应            | 是    |      |
+
+说明：
+
+id_columns 按照顺序依次设置，支持添加 csv 文件中不存在的列作为层级
+例如csv 只有a,b,c,d,time 则

Review Comment:
   fixed



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Features add csv to tsfile [tsfile]

Reply via email to