ORC-1742: Support printing the id, name and type of each column in the dump tool
### What changes were proposed in this pull request?
This PR aims to support printing the id, name and type of each column in the dump tool.

### Why are the changes needed?
When we dump an ORC file with a complex structure, only the column id of each column is printed, so there is no way to tell which column name and type correspond to a given column id.
The JSON output format already includes the id, name and type of each column; this change adds the same information to the plain-text dump via the new `--column-type` option.
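As a minimal sketch of the new behavior, the flag can be exercised the same way the new unit test does (the ORC file path below is a placeholder, and the orc-tools classes are assumed to be on the classpath):

```java
import org.apache.orc.tools.FileDump;

public class DumpColumnTypes {
  public static void main(String[] args) throws Exception {
    // Prints the usual plain-text metadata plus a "Columns type:" section
    // listing the id, full field name, and type of every column.
    FileDump.main(new String[]{"/tmp/example.orc", "--column-type"});
  }
}
```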

### How was this patch tested?
Added a unit test, `TestFileDump#testDumpColumnType`.

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #1974 from cxzl25/ORC-1742.

Authored-by: sychen <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
cxzl25 authored and dongjoon-hyun committed Jul 11, 2024
1 parent a9e0351 commit 8ca3a23
Showing 4 changed files with 218 additions and 4 deletions.
28 changes: 24 additions & 4 deletions java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -134,7 +134,9 @@ public static void main(Configuration conf, String[] args) throws Exception {
boolean prettyPrint = cli.hasOption('p');
JsonFileDump.printJsonMetaData(filesInPath, conf, rowIndexCols, prettyPrint, printTimeZone);
} else {
printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath);
boolean printColumnType = cli.hasOption("column-type");
printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath,
printColumnType);
}
}
}
@@ -268,11 +270,11 @@ public static Collection<String> getAllFilesInPath(final Path path,

private static void printMetaData(List<String> files, Configuration conf,
List<Integer> rowIndexCols, boolean printTimeZone, final boolean recover,
final String backupPath)
final String backupPath, final boolean printColumnType)
throws IOException {
List<String> corruptFiles = new ArrayList<>();
for (String filename : files) {
printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles);
printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles, printColumnType);
System.out.println(SEPARATOR);
}

@@ -294,6 +296,15 @@ private static void printMetaData(List<String> files, Configuration conf,
}
}

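/**
 * Prints one line per column with its id, full field name, and type,
 * walking every column id from the root of the schema up to
 * schema.getMaximumId().
 */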
static void printColumnsType(TypeDescription schema) {
int maximumId = schema.getMaximumId();
for (int c = schema.getId(); c < maximumId + 1; ++c) {
TypeDescription type = schema.findSubtype(c);
System.out.println(" Column " + type.getId() + ": field: " + type.getFullFieldName() +
" type: " + type.toString());
}
}

static void printTypeAnnotations(TypeDescription type, String prefix) {
List<String> attributes = type.getAttributeNames();
if (attributes.size() > 0) {
@@ -329,7 +340,7 @@ static void printTypeAnnotations(TypeDescription type, String prefix) {

private static void printMetaDataImpl(final String filename,
final Configuration conf, List<Integer> rowIndexCols, final boolean printTimeZone,
final List<String> corruptFiles) throws IOException {
final List<String> corruptFiles, final boolean printColumnType) throws IOException {
Path file = new Path(filename);
Reader reader = getReader(file, conf, corruptFiles);
// if we can create a reader, the footer is not corrupt and the file will be readable
@@ -351,6 +362,10 @@ private static void printMetaDataImpl(final String filename,
? "Proleptic Gregorian"
: "Julian/Gregorian"));
System.out.println("Type: " + reader.getSchema().toString());
if (printColumnType) {
System.out.println("Columns type:");
printColumnsType(reader.getSchema());
}
printTypeAnnotations(reader.getSchema(), "root");
System.out.println("\nStripe Statistics:");
List<StripeStatistics> stripeStats = reader.getStripeStatistics();
@@ -835,6 +850,11 @@ static Options createOptions() {
.desc("specify a backup path to store the corrupted files (default: /tmp)")
.hasArg()
.build());

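// Register --column-type as a long-only option (no single-letter alias).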
result.addOption(Option.builder()
.longOpt("column-type")
.desc("Print the column id, name and type of each column")
.build());
return result;
}

70 changes: 70 additions & 0 deletions java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -22,6 +22,7 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
@@ -58,6 +59,7 @@
import java.nio.file.Paths;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
@@ -827,6 +829,74 @@ public void testDoubleNaNAndInfinite() throws Exception {
assertEquals("{\"x\":12.34}", lines[2]);
}

@Test
public void testDumpColumnType() throws Exception {
TypeDescription schema =
TypeDescription.fromString("struct<a:boolean,b:tinyint,c:smallint,d:int,e:bigint," +
"f:float,g:double,h:string,i:date,j:timestamp,k:binary,l:decimal(20,5),m:varchar(5)," +
"n:char(5)>");
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf)
.fileSystem(fs)
.setSchema(schema));

VectorizedRowBatch batch = schema.createRowBatch();
LongColumnVector a = (LongColumnVector) batch.cols[0];
LongColumnVector b = (LongColumnVector) batch.cols[1];
LongColumnVector c = (LongColumnVector) batch.cols[2];
LongColumnVector d = (LongColumnVector) batch.cols[3];
LongColumnVector e = (LongColumnVector) batch.cols[4];
DoubleColumnVector f = (DoubleColumnVector) batch.cols[5];
DoubleColumnVector g = (DoubleColumnVector) batch.cols[6];
BytesColumnVector h = (BytesColumnVector) batch.cols[7];
DateColumnVector i = (DateColumnVector) batch.cols[8];
TimestampColumnVector j = (TimestampColumnVector) batch.cols[9];
BytesColumnVector k = (BytesColumnVector) batch.cols[10];
DecimalColumnVector l = (DecimalColumnVector) batch.cols[11];
BytesColumnVector m = (BytesColumnVector) batch.cols[12];
BytesColumnVector n = (BytesColumnVector) batch.cols[13];

for (int o = 0; o < VectorizedRowBatch.DEFAULT_SIZE * 2; o++) {
int row = batch.size++;
a.vector[row] = row % 2;
b.vector[row] = row % 128;
c.vector[row] = row;
d.vector[row] = row;
e.vector[row] = row * 10000000L;
f.vector[row] = row * 1.0f;
g.vector[row] = row * 1.0d;
byte[] bytes = String.valueOf(row).getBytes(StandardCharsets.UTF_8);
h.setRef(row, bytes, 0, bytes.length);
i.vector[row] = row;
j.time[row] = row * 1000L;
j.nanos[row] = row;
k.setRef(row, bytes, 0, bytes.length);
l.vector[row] = new HiveDecimalWritable(row);
m.setRef(row, bytes, 0, bytes.length);
bytes = String.valueOf(10000 - row).getBytes(StandardCharsets.UTF_8);
n.setRef(row, bytes, 0, bytes.length);

if (batch.size == batch.getMaxSize()) {
writer.addRowBatch(batch);
batch.reset();
}
}
writer.close();
assertEquals(VectorizedRowBatch.DEFAULT_SIZE * 2, writer.getNumberOfRows());

PrintStream origOut = System.out;
String outputFilename = "orc-file-dump-column-type.out";
FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);

// replace stdout and run command
System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8.toString()));
FileDump.main(new String[]{testFilePath.toString(), "--column-type"});
System.out.flush();
System.setOut(origOut);

checkOutput(outputFilename, workDir + File.separator + outputFilename);
}

private static boolean contentEquals(String filePath, String otherFilePath) throws IOException {
try (InputStream is = new BufferedInputStream(new FileInputStream(filePath));
InputStream otherIs = new BufferedInputStream(new FileInputStream(otherFilePath))) {
121 changes: 121 additions & 0 deletions java/tools/src/test/resources/orc-file-dump-column-type.out
@@ -0,0 +1,121 @@
Structure for TestFileDump.testDump.orc
File Version: 0.12 with ORC_14 by ORC Java 2.1.0-SNAPSHOT
Rows: 2048
Compression: ZSTD
Compression size: 262144
Calendar: Julian/Gregorian
Type: struct<a:boolean,b:tinyint,c:smallint,d:int,e:bigint,f:float,g:double,h:string,i:date,j:timestamp,k:binary,l:decimal(20,5),m:varchar(5),n:char(5)>
Columns type:
Column 0: field: 0 type: struct<a:boolean,b:tinyint,c:smallint,d:int,e:bigint,f:float,g:double,h:string,i:date,j:timestamp,k:binary,l:decimal(20,5),m:varchar(5),n:char(5)>
Column 1: field: a type: boolean
Column 2: field: b type: tinyint
Column 3: field: c type: smallint
Column 4: field: d type: int
Column 5: field: e type: bigint
Column 6: field: f type: float
Column 7: field: g type: double
Column 8: field: h type: string
Column 9: field: i type: date
Column 10: field: j type: timestamp
Column 11: field: k type: binary
Column 12: field: l type: decimal(20,5)
Column 13: field: m type: varchar(5)
Column 14: field: n type: char(5)

Stripe Statistics:
Stripe 1:
Column 0: count: 2048 hasNull: false
Column 1: count: 2048 hasNull: false bytesOnDisk: 7 true: 1024
Column 2: count: 2048 hasNull: false bytesOnDisk: 152 min: 0 max: 127 sum: 130048
Column 3: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552
Column 4: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552
Column 5: count: 2048 hasNull: false bytesOnDisk: 35 min: 0 max: 10230000000 sum: 10475520000000
Column 6: count: 2048 hasNull: false bytesOnDisk: 2361 min: 0.0 max: 1023.0 sum: 1047552.0
Column 7: count: 2048 hasNull: false bytesOnDisk: 973 min: 0.0 max: 1023.0 sum: 1047552.0
Column 8: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972
Column 9: count: 2048 hasNull: false bytesOnDisk: 21 min: Hybrid AD 1970-01-01 max: Hybrid AD 1972-10-20
Column 10: count: 2048 hasNull: false bytesOnDisk: 1626 min: 1969-12-31 16:00:00.0 max: 1969-12-31 16:17:03.000001023
Column 11: count: 2048 hasNull: false bytesOnDisk: 1404 sum: 5972
Column 12: count: 2048 hasNull: false bytesOnDisk: 1666 min: 0 max: 1023 sum: 1047552
Column 13: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972
Column 14: count: 2048 hasNull: false bytesOnDisk: 1277 min: 10000 max: 9999 sum: 10240

File Statistics:
Column 0: count: 2048 hasNull: false
Column 1: count: 2048 hasNull: false bytesOnDisk: 7 true: 1024
Column 2: count: 2048 hasNull: false bytesOnDisk: 152 min: 0 max: 127 sum: 130048
Column 3: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552
Column 4: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552
Column 5: count: 2048 hasNull: false bytesOnDisk: 35 min: 0 max: 10230000000 sum: 10475520000000
Column 6: count: 2048 hasNull: false bytesOnDisk: 2361 min: 0.0 max: 1023.0 sum: 1047552.0
Column 7: count: 2048 hasNull: false bytesOnDisk: 973 min: 0.0 max: 1023.0 sum: 1047552.0
Column 8: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972
Column 9: count: 2048 hasNull: false bytesOnDisk: 21 min: Hybrid AD 1970-01-01 max: Hybrid AD 1972-10-20
Column 10: count: 2048 hasNull: false bytesOnDisk: 1626 min: 1969-12-31 16:00:00.0 max: 1969-12-31 16:17:03.000001023
Column 11: count: 2048 hasNull: false bytesOnDisk: 1404 sum: 5972
Column 12: count: 2048 hasNull: false bytesOnDisk: 1666 min: 0 max: 1023 sum: 1047552
Column 13: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972
Column 14: count: 2048 hasNull: false bytesOnDisk: 1277 min: 10000 max: 9999 sum: 10240

Stripes:
Stripe: offset: 3 data: 15540 rows: 2048 tail: 225 index: 464
Stream: column 0 section ROW_INDEX start: 3 length 12
Stream: column 1 section ROW_INDEX start: 15 length 24
Stream: column 2 section ROW_INDEX start: 39 length 28
Stream: column 3 section ROW_INDEX start: 67 length 28
Stream: column 4 section ROW_INDEX start: 95 length 28
Stream: column 5 section ROW_INDEX start: 123 length 35
Stream: column 6 section ROW_INDEX start: 158 length 45
Stream: column 7 section ROW_INDEX start: 203 length 45
Stream: column 8 section ROW_INDEX start: 248 length 30
Stream: column 9 section ROW_INDEX start: 278 length 24
Stream: column 10 section ROW_INDEX start: 302 length 35
Stream: column 11 section ROW_INDEX start: 337 length 24
Stream: column 12 section ROW_INDEX start: 361 length 39
Stream: column 13 section ROW_INDEX start: 400 length 30
Stream: column 14 section ROW_INDEX start: 430 length 37
Stream: column 1 section DATA start: 467 length 7
Stream: column 2 section DATA start: 474 length 152
Stream: column 3 section DATA start: 626 length 21
Stream: column 4 section DATA start: 647 length 21
Stream: column 5 section DATA start: 668 length 35
Stream: column 6 section DATA start: 703 length 2361
Stream: column 7 section DATA start: 3064 length 973
Stream: column 8 section DATA start: 4037 length 1575
Stream: column 8 section LENGTH start: 5612 length 47
Stream: column 8 section DICTIONARY_DATA start: 5659 length 1366
Stream: column 9 section DATA start: 7025 length 21
Stream: column 10 section DATA start: 7046 length 35
Stream: column 10 section SECONDARY start: 7081 length 1591
Stream: column 11 section DATA start: 8672 length 1368
Stream: column 11 section LENGTH start: 10040 length 36
Stream: column 12 section DATA start: 10076 length 1647
Stream: column 12 section SECONDARY start: 11723 length 19
Stream: column 13 section DATA start: 11742 length 1575
Stream: column 13 section LENGTH start: 13317 length 47
Stream: column 13 section DICTIONARY_DATA start: 13364 length 1366
Stream: column 14 section DATA start: 14730 length 753
Stream: column 14 section LENGTH start: 15483 length 11
Stream: column 14 section DICTIONARY_DATA start: 15494 length 513
Encoding column 0: DIRECT
Encoding column 1: DIRECT
Encoding column 2: DIRECT
Encoding column 3: DIRECT_V2
Encoding column 4: DIRECT_V2
Encoding column 5: DIRECT_V2
Encoding column 6: DIRECT
Encoding column 7: DIRECT
Encoding column 8: DICTIONARY_V2[1024]
Encoding column 9: DIRECT_V2
Encoding column 10: DIRECT_V2
Encoding column 11: DIRECT_V2
Encoding column 12: DIRECT_V2
Encoding column 13: DICTIONARY_V2[1024]
Encoding column 14: DICTIONARY_V2[1024]

File length: 16919 bytes
File raw data size: 1048404 bytes
Padding length: 0 bytes
Padding ratio: 0%
________________________________________________________________________________________________________________________

3 changes: 3 additions & 0 deletions site/_docs/java-tools.md
@@ -142,6 +142,9 @@ equivalent to the Hive ORC File Dump command.
`--backup-path <path>`
: when used with --recover specifies the path where the recovered file is written (default: /tmp)

`--column-type`
: Print the column id, name and type of each column

`-d,--data`
: Should the data be printed

