ORC-1742: Support printing the id, name and type of each column in the dump tool
### What changes were proposed in this pull request?
This PR aims to support printing the id, name and type of each column in the dump tool.

### Why are the changes needed?
When we dump an ORC file with a complex structure, only the column id of each column is printed, so there is no way to tell which column name and type correspond to a given column id.
The JSON output format already includes the id, name and type of each column; this change adds the same information to the plain-text dump via the new `--column-type` option.
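As a minimal sketch of the new behavior, the flag can be exercised the same way the new unit test does (the ORC file path below is a placeholder, and the orc-tools classes are assumed to be on the classpath):

```java
import org.apache.orc.tools.FileDump;

public class DumpColumnTypes {
  public static void main(String[] args) throws Exception {
    // Prints the usual plain-text metadata plus a "Columns type:" section
    // listing the id, full field name, and type of every column.
    FileDump.main(new String[]{"/tmp/example.orc", "--column-type"});
  }
}
```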

### How was this patch tested?
Added a unit test, `TestFileDump#testDumpColumnType`.

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #1974 from cxzl25/ORC-1742.

Authored-by: sychen <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
cxzl25 authored and dongjoon-hyun committed Jul 11, 2024
1 parent a9e0351 commit 8ca3a23
Showing 4 changed files with 218 additions and 4 deletions.
28 changes: 24 additions & 4 deletions java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -134,7 +134,9 @@ public static void main(Configuration conf, String[] args) throws Exception {
boolean prettyPrint = cli.hasOption('p');
JsonFileDump.printJsonMetaData(filesInPath, conf, rowIndexCols, prettyPrint, printTimeZone);
} else {
printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath);
boolean printColumnType = cli.hasOption("column-type");
printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath,
printColumnType);
}
}
}
@@ -268,11 +270,11 @@ public static Collection<String> getAllFilesInPath(final Path path,

private static void printMetaData(List<String> files, Configuration conf,
List<Integer> rowIndexCols, boolean printTimeZone, final boolean recover,
final String backupPath)
final String backupPath, final boolean printColumnType)
throws IOException {
List<String> corruptFiles = new ArrayList<>();
for (String filename : files) {
printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles);
printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles, printColumnType);
System.out.println(SEPARATOR);
}

@@ -294,6 +296,15 @@ private static void printMetaData(List<String> files, Configuration conf,
}
}

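/**
 * Prints one line per column with its id, full field name, and type,
 * walking every column id from the root of the schema up to
 * schema.getMaximumId().
 */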
static void printColumnsType(TypeDescription schema) {
int maximumId = schema.getMaximumId();
for (int c = schema.getId(); c < maximumId + 1; ++c) {
TypeDescription type = schema.findSubtype(c);
System.out.println(" Column " + type.getId() + ": field: " + type.getFullFieldName() +
" type: " + type.toString());
}
}

static void printTypeAnnotations(TypeDescription type, String prefix) {
List<String> attributes = type.getAttributeNames();
if (attributes.size() > 0) {
@@ -329,7 +340,7 @@ static void printTypeAnnotations(TypeDescription type, String prefix) {

private static void printMetaDataImpl(final String filename,
final Configuration conf, List<Integer> rowIndexCols, final boolean printTimeZone,
final List<String> corruptFiles) throws IOException {
final List<String> corruptFiles, final boolean printColumnType) throws IOException {
Path file = new Path(filename);
Reader reader = getReader(file, conf, corruptFiles);
// if we can create a reader, the footer is not corrupt and the file will be readable
@@ -351,6 +362,10 @@ private static void printMetaDataImpl(final String filename,
? "Proleptic Gregorian"
: "Julian/Gregorian"));
System.out.println("Type: " + reader.getSchema().toString());
if (printColumnType) {
System.out.println("Columns type:");
printColumnsType(reader.getSchema());
}
printTypeAnnotations(reader.getSchema(), "root");
System.out.println("\nStripe Statistics:");
List<StripeStatistics> stripeStats = reader.getStripeStatistics();
@@ -835,6 +850,11 @@ static Options createOptions() {
.desc("specify a backup path to store the corrupted files (default: /tmp)")
.hasArg()
.build());

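// Register --column-type as a long-only option (no single-letter alias).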
result.addOption(Option.builder()
.longOpt("column-type")
.desc("Print the column id, name and type of each column")
.build());
return result;
}

70 changes: 70 additions & 0 deletions java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -22,6 +22,7 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
@@ -58,6 +59,7 @@
import java.nio.file.Paths;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
@@ -827,6 +829,74 @@ public void testDoubleNaNAndInfinite() throws Exception {
assertEquals("{\"x\":12.34}", lines[2]);
}

@Test
public void testDumpColumnType() throws Exception {
TypeDescription schema =
TypeDescription.fromString("struct<a:boolean,b:tinyint,c:smallint,d:int,e:bigint," +
"f:float,g:double,h:string,i:date,j:timestamp,k:binary,l:decimal(20,5),m:varchar(5)," +
"n:char(5)>");
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf)
.fileSystem(fs)
.setSchema(schema));

VectorizedRowBatch batch = schema.createRowBatch();
LongColumnVector a = (LongColumnVector) batch.cols[0];
LongColumnVector b = (LongColumnVector) batch.cols[1];
LongColumnVector c = (LongColumnVector) batch.cols[2];
LongColumnVector d = (LongColumnVector) batch.cols[3];
LongColumnVector e = (LongColumnVector) batch.cols[4];
DoubleColumnVector f = (DoubleColumnVector) batch.cols[5];
DoubleColumnVector g = (DoubleColumnVector) batch.cols[6];
BytesColumnVector h = (BytesColumnVector) batch.cols[7];
DateColumnVector i = (DateColumnVector) batch.cols[8];
TimestampColumnVector j = (TimestampColumnVector) batch.cols[9];
BytesColumnVector k = (BytesColumnVector) batch.cols[10];
DecimalColumnVector l = (DecimalColumnVector) batch.cols[11];
BytesColumnVector m = (BytesColumnVector) batch.cols[12];
BytesColumnVector n = (BytesColumnVector) batch.cols[13];

for (int o = 0; o < VectorizedRowBatch.DEFAULT_SIZE * 2; o++) {
int row = batch.size++;
a.vector[row] = row % 2;
b.vector[row] = row % 128;
c.vector[row] = row;
d.vector[row] = row;
e.vector[row] = row * 10000000L;
f.vector[row] = row * 1.0f;
g.vector[row] = row * 1.0d;
byte[] bytes = String.valueOf(row).getBytes(StandardCharsets.UTF_8);
h.setRef(row, bytes, 0, bytes.length);
i.vector[row] = row;
j.time[row] = row * 1000L;
j.nanos[row] = row;
k.setRef(row, bytes, 0, bytes.length);
l.vector[row] = new HiveDecimalWritable(row);
m.setRef(row, bytes, 0, bytes.length);
bytes = String.valueOf(10000 - row).getBytes(StandardCharsets.UTF_8);
n.setRef(row, bytes, 0, bytes.length);

if (batch.size == batch.getMaxSize()) {
writer.addRowBatch(batch);
batch.reset();
}
}
writer.close();
assertEquals(VectorizedRowBatch.DEFAULT_SIZE * 2, writer.getNumberOfRows());

PrintStream origOut = System.out;
String outputFilename = "orc-file-dump-column-type.out";
FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);

// replace stdout and run command
System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8.toString()));
FileDump.main(new String[]{testFilePath.toString(), "--column-type"});
System.out.flush();
System.setOut(origOut);

checkOutput(outputFilename, workDir + File.separator + outputFilename);
}

private static boolean contentEquals(String filePath, String otherFilePath) throws IOException {
try (InputStream is = new BufferedInputStream(new FileInputStream(filePath));
InputStream otherIs = new BufferedInputStream(new FileInputStream(otherFilePath))) {
121 changes: 121 additions & 0 deletions java/tools/src/test/resources/orc-file-dump-column-type.out
@@ -0,0 +1,121 @@
Structure for TestFileDump.testDump.orc
File Version: 0.12 with ORC_14 by ORC Java 2.1.0-SNAPSHOT
Rows: 2048
Compression: ZSTD
Compression size: 262144
Calendar: Julian/Gregorian
Type: struct<a:boolean,b:tinyint,c:smallint,d:int,e:bigint,f:float,g:double,h:string,i:date,j:timestamp,k:binary,l:decimal(20,5),m:varchar(5),n:char(5)>
Columns type:
Column 0: field: 0 type: struct<a:boolean,b:tinyint,c:smallint,d:int,e:bigint,f:float,g:double,h:string,i:date,j:timestamp,k:binary,l:decimal(20,5),m:varchar(5),n:char(5)>
Column 1: field: a type: boolean
Column 2: field: b type: tinyint
Column 3: field: c type: smallint
Column 4: field: d type: int
Column 5: field: e type: bigint
Column 6: field: f type: float
Column 7: field: g type: double
Column 8: field: h type: string
Column 9: field: i type: date
Column 10: field: j type: timestamp
Column 11: field: k type: binary
Column 12: field: l type: decimal(20,5)
Column 13: field: m type: varchar(5)
Column 14: field: n type: char(5)

Stripe Statistics:
Stripe 1:
Column 0: count: 2048 hasNull: false
Column 1: count: 2048 hasNull: false bytesOnDisk: 7 true: 1024
Column 2: count: 2048 hasNull: false bytesOnDisk: 152 min: 0 max: 127 sum: 130048
Column 3: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552
Column 4: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552
Column 5: count: 2048 hasNull: false bytesOnDisk: 35 min: 0 max: 10230000000 sum: 10475520000000
Column 6: count: 2048 hasNull: false bytesOnDisk: 2361 min: 0.0 max: 1023.0 sum: 1047552.0
Column 7: count: 2048 hasNull: false bytesOnDisk: 973 min: 0.0 max: 1023.0 sum: 1047552.0
Column 8: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972
Column 9: count: 2048 hasNull: false bytesOnDisk: 21 min: Hybrid AD 1970-01-01 max: Hybrid AD 1972-10-20
Column 10: count: 2048 hasNull: false bytesOnDisk: 1626 min: 1969-12-31 16:00:00.0 max: 1969-12-31 16:17:03.000001023
Column 11: count: 2048 hasNull: false bytesOnDisk: 1404 sum: 5972
Column 12: count: 2048 hasNull: false bytesOnDisk: 1666 min: 0 max: 1023 sum: 1047552
Column 13: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972
Column 14: count: 2048 hasNull: false bytesOnDisk: 1277 min: 10000 max: 9999 sum: 10240

File Statistics:
Column 0: count: 2048 hasNull: false
Column 1: count: 2048 hasNull: false bytesOnDisk: 7 true: 1024
Column 2: count: 2048 hasNull: false bytesOnDisk: 152 min: 0 max: 127 sum: 130048
Column 3: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552
Column 4: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552
Column 5: count: 2048 hasNull: false bytesOnDisk: 35 min: 0 max: 10230000000 sum: 10475520000000
Column 6: count: 2048 hasNull: false bytesOnDisk: 2361 min: 0.0 max: 1023.0 sum: 1047552.0
Column 7: count: 2048 hasNull: false bytesOnDisk: 973 min: 0.0 max: 1023.0 sum: 1047552.0
Column 8: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972
Column 9: count: 2048 hasNull: false bytesOnDisk: 21 min: Hybrid AD 1970-01-01 max: Hybrid AD 1972-10-20
Column 10: count: 2048 hasNull: false bytesOnDisk: 1626 min: 1969-12-31 16:00:00.0 max: 1969-12-31 16:17:03.000001023
Column 11: count: 2048 hasNull: false bytesOnDisk: 1404 sum: 5972
Column 12: count: 2048 hasNull: false bytesOnDisk: 1666 min: 0 max: 1023 sum: 1047552
Column 13: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972
Column 14: count: 2048 hasNull: false bytesOnDisk: 1277 min: 10000 max: 9999 sum: 10240

Stripes:
Stripe: offset: 3 data: 15540 rows: 2048 tail: 225 index: 464
Stream: column 0 section ROW_INDEX start: 3 length 12
Stream: column 1 section ROW_INDEX start: 15 length 24
Stream: column 2 section ROW_INDEX start: 39 length 28
Stream: column 3 section ROW_INDEX start: 67 length 28
Stream: column 4 section ROW_INDEX start: 95 length 28
Stream: column 5 section ROW_INDEX start: 123 length 35
Stream: column 6 section ROW_INDEX start: 158 length 45
Stream: column 7 section ROW_INDEX start: 203 length 45
Stream: column 8 section ROW_INDEX start: 248 length 30
Stream: column 9 section ROW_INDEX start: 278 length 24
Stream: column 10 section ROW_INDEX start: 302 length 35
Stream: column 11 section ROW_INDEX start: 337 length 24
Stream: column 12 section ROW_INDEX start: 361 length 39
Stream: column 13 section ROW_INDEX start: 400 length 30
Stream: column 14 section ROW_INDEX start: 430 length 37
Stream: column 1 section DATA start: 467 length 7
Stream: column 2 section DATA start: 474 length 152
Stream: column 3 section DATA start: 626 length 21
Stream: column 4 section DATA start: 647 length 21
Stream: column 5 section DATA start: 668 length 35
Stream: column 6 section DATA start: 703 length 2361
Stream: column 7 section DATA start: 3064 length 973
Stream: column 8 section DATA start: 4037 length 1575
Stream: column 8 section LENGTH start: 5612 length 47
Stream: column 8 section DICTIONARY_DATA start: 5659 length 1366
Stream: column 9 section DATA start: 7025 length 21
Stream: column 10 section DATA start: 7046 length 35
Stream: column 10 section SECONDARY start: 7081 length 1591
Stream: column 11 section DATA start: 8672 length 1368
Stream: column 11 section LENGTH start: 10040 length 36
Stream: column 12 section DATA start: 10076 length 1647
Stream: column 12 section SECONDARY start: 11723 length 19
Stream: column 13 section DATA start: 11742 length 1575
Stream: column 13 section LENGTH start: 13317 length 47
Stream: column 13 section DICTIONARY_DATA start: 13364 length 1366
Stream: column 14 section DATA start: 14730 length 753
Stream: column 14 section LENGTH start: 15483 length 11
Stream: column 14 section DICTIONARY_DATA start: 15494 length 513
Encoding column 0: DIRECT
Encoding column 1: DIRECT
Encoding column 2: DIRECT
Encoding column 3: DIRECT_V2
Encoding column 4: DIRECT_V2
Encoding column 5: DIRECT_V2
Encoding column 6: DIRECT
Encoding column 7: DIRECT
Encoding column 8: DICTIONARY_V2[1024]
Encoding column 9: DIRECT_V2
Encoding column 10: DIRECT_V2
Encoding column 11: DIRECT_V2
Encoding column 12: DIRECT_V2
Encoding column 13: DICTIONARY_V2[1024]
Encoding column 14: DICTIONARY_V2[1024]

File length: 16919 bytes
File raw data size: 1048404 bytes
Padding length: 0 bytes
Padding ratio: 0%
________________________________________________________________________________________________________________________

3 changes: 3 additions & 0 deletions site/_docs/java-tools.md
@@ -142,6 +142,9 @@ equivalent to the Hive ORC File Dump command.
`--backup-path <path>`
: when used with --recover specifies the path where the recovered file is written (default: /tmp)

`--column-type`
: Print the column id, name and type of each column

`-d,--data`
: Should the data be printed

