Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Jsonl support #1260

Merged
merged 7 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,6 @@ bin/
nbproject/private/
nbactions.xml
nb-configuration.xml

# VSCode
.vscode/
184 changes: 184 additions & 0 deletions json/src/main/java/tech/tablesaw/io/jsonl/JsonlReadOptions.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
package tech.tablesaw.io.jsonl;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.time.format.DateTimeFormatter;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import tech.tablesaw.api.ColumnType;
import tech.tablesaw.io.ReadOptions;
import tech.tablesaw.io.Source;

public class JsonlReadOptions extends ReadOptions {
benmccann marked this conversation as resolved.
Show resolved Hide resolved

private final String path;

protected JsonlReadOptions(Builder builder) {
super(builder);
this.path = builder.path;
}

public static Builder builder(Source source) {
return new Builder(source);
}

public static Builder builder(File file) {
return new Builder(file).tableName(file.getName());
}

public static Builder builder(String fileName) {
return new Builder(new File(fileName));
}

public static Builder builder(URL url) throws IOException {
return new Builder(url);
}

public static Builder builderFromFile(String fileName) {
return new Builder(new File(fileName));
}

public static Builder builderFromString(String contents) {
return new Builder(new StringReader(contents));
}

public static Builder builderFromUrl(String url) throws IOException {
return new Builder(new URL(url));
}

public static Builder builder(InputStream stream) {
return new Builder(stream);
}

public static Builder builder(Reader reader) {
return new Builder(reader);
}

public String path() {
return path;
}

public static class Builder extends ReadOptions.Builder {

private String path;

protected Builder(Source source) {
super(source);
}

protected Builder(URL url) throws IOException {
super(url);
}

public Builder(File file) {
super(file);
}

protected Builder(Reader reader) {
super(reader);
}

protected Builder(InputStream stream) {
super(stream);
}

@Override
public JsonlReadOptions build() {
return new JsonlReadOptions(this);
}

// Override super-class setters to return an instance of this class

@Override
public Builder header(boolean header) {
super.header(header);
return this;
}

@Override
public Builder tableName(String tableName) {
super.tableName(tableName);
return this;
}

@Override
public Builder sample(boolean sample) {
super.sample(sample);
return this;
}

@Override
public Builder dateFormat(DateTimeFormatter dateFormat) {
super.dateFormat(dateFormat);
return this;
}

@Override
public Builder timeFormat(DateTimeFormatter timeFormat) {
super.timeFormat(timeFormat);
return this;
}

@Override
public Builder dateTimeFormat(DateTimeFormatter dateTimeFormat) {
super.dateTimeFormat(dateTimeFormat);
return this;
}

@Override
public Builder locale(Locale locale) {
super.locale(locale);
return this;
}

@Override
public Builder missingValueIndicator(String... missingValueIndicators) {
super.missingValueIndicator(missingValueIndicators);
return this;
}

@Override
public Builder minimizeColumnSizes() {
super.minimizeColumnSizes();
return this;
}

/**
* @param path the JSON Pointer path used to select a sub-tree in the main document
*/
public Builder path(String path) {
this.path = path;
return this;
}

@Override
public Builder columnTypes(ColumnType[] columnTypes) {
super.columnTypes(columnTypes);
return this;
}

@Override
public Builder columnTypes(Function<String, ColumnType> columnTypeFunction) {
super.columnTypes(columnTypeFunction);
return this;
}

@Override
public Builder columnTypesPartial(Function<String, Optional<ColumnType>> columnTypeFunction) {
super.columnTypesPartial(columnTypeFunction);
return this;
}

@Override
public Builder columnTypesPartial(Map<String, ColumnType> columnTypeByName) {
super.columnTypesPartial(columnTypeByName);
return this;
}
}
}
108 changes: 108 additions & 0 deletions json/src/main/java/tech/tablesaw/io/jsonl/JsonlReader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
package tech.tablesaw.io.jsonl;

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectReader;
import com.github.wnameless.json.flattener.JsonFlattener;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import tech.tablesaw.api.Table;
import tech.tablesaw.io.DataReader;
import tech.tablesaw.io.ReadOptions;
import tech.tablesaw.io.ReaderRegistry;
import tech.tablesaw.io.RuntimeIOException;
import tech.tablesaw.io.Source;
import tech.tablesaw.io.TableBuildingUtils;

public class JsonlReader implements DataReader<JsonlReadOptions> {

private static final JsonlReader INSTANCE = new JsonlReader();
private static final ObjectMapper mapper = new ObjectMapper();

static {
register(Table.defaultReaderRegistry);
}

public static void register(ReaderRegistry registry) {
registry.registerExtension("jsonl", INSTANCE);
registry.registerMimeType("text/jsonl", INSTANCE);
registry.registerMimeType("application/jsonl+json", INSTANCE);
dsyer marked this conversation as resolved.
Show resolved Hide resolved
registry.registerOptions(JsonlReadOptions.class, INSTANCE);
}

@Override
public Table read(JsonlReadOptions options) {
ObjectReader stream = mapper.readerFor(JsonNode.class);
try {
Reader reader = options.source().createReader(null);
JsonParser parser = stream.createParser(reader);
Iterator<JsonNode> iter = stream.readValues(parser);
return convertObjects(iter, options);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
}

private Table convertObjects(Iterator<JsonNode> iter, ReadOptions options) {
// flatten each object inside the array
StringBuilder result = new StringBuilder("[");
boolean first = true;
for (; iter.hasNext(); ) {
JsonNode rowObj = iter.next();
String flattenedRow = null;
try {
flattenedRow = JsonFlattener.flatten(mapper.writeValueAsString(rowObj));
} catch (JsonProcessingException e) {
throw new RuntimeIOException(e);
}
if (!first) {
result.append(",");
}
first = false;
result.append(flattenedRow);
}
String flattenedJsonString = result.append("]").toString();
JsonNode flattenedJsonObj = null;
try {
flattenedJsonObj = mapper.readTree(flattenedJsonString);
} catch (JsonProcessingException e) {
throw new RuntimeIOException(e);
}

Set<String> colNames = new LinkedHashSet<>();
for (JsonNode row : flattenedJsonObj) {
Iterator<String> fieldNames = row.fieldNames();
while (fieldNames.hasNext()) {
colNames.add(fieldNames.next());
}
}

List<String> columnNames = new ArrayList<>(colNames);
List<String[]> dataRows = new ArrayList<>();
for (JsonNode node : flattenedJsonObj) {
String[] arr = new String[columnNames.size()];
for (int i = 0; i < columnNames.size(); i++) {
if (node.has(columnNames.get(i))) {
arr[i] = node.get(columnNames.get(i)).asText();
} else {
arr[i] = null;
}
}
dataRows.add(arr);
}

return TableBuildingUtils.build(columnNames, dataRows, options);
}

@Override
public Table read(Source source) {
return read(JsonlReadOptions.builder(source).build());
}
}
31 changes: 31 additions & 0 deletions json/src/main/java/tech/tablesaw/io/jsonl/JsonlWriteOptions.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package tech.tablesaw.io.jsonl;

import java.io.Writer;
import tech.tablesaw.io.Destination;
import tech.tablesaw.io.WriteOptions;

/**
 * Options for writing a table as JSON Lines (JSONL).
 *
 * <p>Instances are created through the nested {@link Builder}; this class adds no settings of
 * its own on top of {@link WriteOptions}.
 */
public class JsonlWriteOptions extends WriteOptions {

  private JsonlWriteOptions(Builder builder) {
    super(builder);
  }

  /** Returns a builder whose destination wraps the given writer. */
  public static Builder builder(Writer writer) {
    return builder(new Destination(writer));
  }

  /** Returns a builder that writes to the given destination. */
  public static Builder builder(Destination destination) {
    return new Builder(destination);
  }

  /** Builder for {@link JsonlWriteOptions}. */
  public static class Builder extends WriteOptions.Builder {

    protected Builder(Destination destination) {
      super(destination);
    }

    /** Creates the options object from the current builder state. */
    public JsonlWriteOptions build() {
      return new JsonlWriteOptions(this);
    }
  }
}
66 changes: 66 additions & 0 deletions json/src/main/java/tech/tablesaw/io/jsonl/JsonlWriter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package tech.tablesaw.io.jsonl;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import java.io.IOException;
import java.io.Writer;
import tech.tablesaw.api.Table;
import tech.tablesaw.io.DataWriter;
import tech.tablesaw.io.Destination;
import tech.tablesaw.io.RuntimeIOException;
import tech.tablesaw.io.WriterRegistry;

/**
 * Writes a {@link Table} as JSON Lines (JSONL): one JSON object per table row, with rows
 * separated by a single {@code "\n"} and no newline after the last row.
 */
public class JsonlWriter implements DataWriter<JsonlWriteOptions> {

  private static final JsonlWriter INSTANCE = new JsonlWriter();
  private static final ObjectMapper mapper =
      new ObjectMapper().registerModule(new JavaTimeModule());

  static {
    register(Table.defaultWriterRegistry);
  }

  /**
   * Registers this writer with the given registry for the {@code jsonl} extension and the {@link
   * JsonlWriteOptions} class.
   *
   * @param registry the registry to add this writer to
   */
  public static void register(WriterRegistry registry) {
    registry.registerExtension("jsonl", INSTANCE);
    registry.registerOptions(JsonlWriteOptions.class, INSTANCE);
  }

  /**
   * Writes every row of {@code table} to the destination in {@code options}. The writer obtained
   * from the destination is closed when this method returns.
   *
   * @param table the table to serialise
   * @param options the write options holding the destination
   * @throws RuntimeIOException if serialising or writing a row fails
   */
  public void write(Table table, JsonlWriteOptions options) {
    try (Writer out = options.destination().createWriter()) {
      final int rowCount = table.rowCount();
      final int lastRow = rowCount - 1;
      for (int r = 0; r < rowCount; r++) {
        out.write(mapper.writeValueAsString(toJsonRow(table, r)));
        // Separator goes between rows only, so the output has no trailing newline.
        if (r < lastRow) {
          out.write("\n");
        }
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /** Builds the JSON object for one table row, keyed by column name in column order. */
  private static ObjectNode toJsonRow(Table table, int r) {
    ObjectNode row = mapper.createObjectNode();
    final int columnCount = table.columnCount();
    for (int c = 0; c < columnCount; c++) {
      String columnName = table.column(c).name();
      JsonNode cell = mapper.convertValue(table.get(r, c), JsonNode.class);
      row.set(columnName, cell);
    }
    return row;
  }

  /**
   * Writes {@code table} to {@code dest} using default {@link JsonlWriteOptions}.
   *
   * @param table the table to serialise
   * @param dest the destination to write to
   */
  @Override
  public void write(Table table, Destination dest) {
    write(table, JsonlWriteOptions.builder(dest).build());
  }
}
Loading
Loading