Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Jsonl support #1260

Merged
merged 7 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,6 @@ bin/
nbproject/private/
nbactions.xml
nb-configuration.xml

# VSCode
.vscode/
5 changes: 3 additions & 2 deletions docs-src/main/userguide/importing_data.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@ Tablesaw supports importing and exporting data to and from a variety of data typ
| :----------------------------- | :----- | :----- |
| CSV (and other delimited text) | Yes | Yes |
| JSON | Yes | Yes |
| JSONL | Yes | Yes |
| RDBMS (via JDBC) | Yes | |
| Fixed Width Text | Yes | Yes |
| Excel | Yes | |
| Excel | Yes | Yes |
| HTML | Yes | Yes |

## Importing data
Expand Down Expand Up @@ -260,4 +261,4 @@ List<Table> tables= xlsxReader.readMultiple(options);



TODO
TODO
184 changes: 184 additions & 0 deletions json/src/main/java/tech/tablesaw/io/jsonl/JsonlReadOptions.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
package tech.tablesaw.io.jsonl;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.time.format.DateTimeFormatter;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import tech.tablesaw.api.ColumnType;
import tech.tablesaw.io.ReadOptions;
import tech.tablesaw.io.Source;

public class JsonlReadOptions extends ReadOptions {
benmccann marked this conversation as resolved.
Show resolved Hide resolved

private final String path;

protected JsonlReadOptions(Builder builder) {
super(builder);
this.path = builder.path;
}

public static Builder builder(Source source) {
return new Builder(source);
}

public static Builder builder(File file) {
return new Builder(file).tableName(file.getName());
}

public static Builder builder(String fileName) {
return new Builder(new File(fileName));
}

public static Builder builder(URL url) throws IOException {
return new Builder(url);
}

public static Builder builderFromFile(String fileName) {
return new Builder(new File(fileName));
}

public static Builder builderFromString(String contents) {
return new Builder(new StringReader(contents));
}

public static Builder builderFromUrl(String url) throws IOException {
return new Builder(new URL(url));
}

public static Builder builder(InputStream stream) {
return new Builder(stream);
}

public static Builder builder(Reader reader) {
return new Builder(reader);
}

public String path() {
return path;
}

public static class Builder extends ReadOptions.Builder {

private String path;

protected Builder(Source source) {
super(source);
}

protected Builder(URL url) throws IOException {
super(url);
}

public Builder(File file) {
super(file);
}

protected Builder(Reader reader) {
super(reader);
}

protected Builder(InputStream stream) {
super(stream);
}

@Override
public JsonlReadOptions build() {
return new JsonlReadOptions(this);
}

// Override super-class setters to return an instance of this class

@Override
public Builder header(boolean header) {
super.header(header);
return this;
}

@Override
public Builder tableName(String tableName) {
super.tableName(tableName);
return this;
}

@Override
public Builder sample(boolean sample) {
super.sample(sample);
return this;
}

@Override
public Builder dateFormat(DateTimeFormatter dateFormat) {
super.dateFormat(dateFormat);
return this;
}

@Override
public Builder timeFormat(DateTimeFormatter timeFormat) {
super.timeFormat(timeFormat);
return this;
}

@Override
public Builder dateTimeFormat(DateTimeFormatter dateTimeFormat) {
super.dateTimeFormat(dateTimeFormat);
return this;
}

@Override
public Builder locale(Locale locale) {
super.locale(locale);
return this;
}

@Override
public Builder missingValueIndicator(String... missingValueIndicators) {
super.missingValueIndicator(missingValueIndicators);
return this;
}

@Override
public Builder minimizeColumnSizes() {
super.minimizeColumnSizes();
return this;
}

/**
* @param path the JSON Pointer path used to select a sub-tree in the main document
*/
public Builder path(String path) {
this.path = path;
return this;
}

@Override
public Builder columnTypes(ColumnType[] columnTypes) {
super.columnTypes(columnTypes);
return this;
}

@Override
public Builder columnTypes(Function<String, ColumnType> columnTypeFunction) {
super.columnTypes(columnTypeFunction);
return this;
}

@Override
public Builder columnTypesPartial(Function<String, Optional<ColumnType>> columnTypeFunction) {
super.columnTypesPartial(columnTypeFunction);
return this;
}

@Override
public Builder columnTypesPartial(Map<String, ColumnType> columnTypeByName) {
super.columnTypesPartial(columnTypeByName);
return this;
}
}
}
127 changes: 127 additions & 0 deletions json/src/main/java/tech/tablesaw/io/jsonl/JsonlReader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
package tech.tablesaw.io.jsonl;

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectReader;
import com.github.wnameless.json.flattener.JsonFlattener;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import tech.tablesaw.api.Table;
import tech.tablesaw.io.DataReader;
import tech.tablesaw.io.ReadOptions;
import tech.tablesaw.io.ReaderRegistry;
import tech.tablesaw.io.RuntimeIOException;
import tech.tablesaw.io.Source;
import tech.tablesaw.io.TableBuildingUtils;

/**
 * Reads JSON Lines input (a sequence of top-level JSON values) into a {@link Table}.
 *
 * <p>Every top-level value is flattened with {@link JsonFlattener}, so nested fields become
 * individual columns. The table's columns are the union of all flattened field names, in the order
 * in which they are first encountered. A row that lacks a field, or holds a JSON {@code null} for
 * it, gets a {@code null} cell in that column.
 */
public class JsonlReader implements DataReader<JsonlReadOptions> {

  private static final JsonlReader INSTANCE = new JsonlReader();
  private static final ObjectMapper mapper = new ObjectMapper();

  static {
    register(Table.defaultReaderRegistry);
  }

  /**
   * Registers this reader with {@code registry} for the {@code jsonl} file extension, for the MIME
   * types listed below, and for {@link JsonlReadOptions}.
   *
   * @param registry the registry to add this reader to
   */
  public static void register(ReaderRegistry registry) {
    // no standard exists yet. taken from https://murex.rocks/types/jsonl.html#more-information
    registry.registerExtension("jsonl", INSTANCE);
    registry.registerMimeType("application/json-lines", INSTANCE);
    registry.registerMimeType("application/jsonl", INSTANCE);
    registry.registerMimeType("application/jsonlines", INSTANCE);
    registry.registerMimeType("application/ldjson", INSTANCE);
    registry.registerMimeType("application/ndjson", INSTANCE);
    registry.registerMimeType("application/x-json-lines", INSTANCE);
    registry.registerMimeType("application/x-jsonl", INSTANCE);
    registry.registerMimeType("application/x-jsonlines", INSTANCE);
    registry.registerMimeType("application/x-ldjson", INSTANCE);
    registry.registerMimeType("application/x-ndjson", INSTANCE);
    registry.registerMimeType("text/json-lines", INSTANCE);
    registry.registerMimeType("text/jsonl", INSTANCE);
    registry.registerMimeType("text/jsonlines", INSTANCE);
    registry.registerMimeType("text/ldjson", INSTANCE);
    registry.registerMimeType("text/ndjson", INSTANCE);
    registry.registerMimeType("text/x-json-lines", INSTANCE);
    registry.registerMimeType("text/x-jsonl", INSTANCE);
    registry.registerMimeType("text/x-jsonlines", INSTANCE);
    registry.registerMimeType("text/x-ldjson", INSTANCE);
    registry.registerMimeType("text/x-ndjson", INSTANCE);
    registry.registerOptions(JsonlReadOptions.class, INSTANCE);
  }

  /**
   * Reads the source described by {@code options} and builds a table from it.
   *
   * @param options the source to read plus the options forwarded to the table builder
   * @return the table built from the flattened JSON values
   * @throws RuntimeIOException if the reader or parser cannot be created, or a value cannot be
   *     flattened
   */
  @Override
  public Table read(JsonlReadOptions options) {
    ObjectReader stream = mapper.readerFor(JsonNode.class);
    // With Jackson's default settings the parser closes its input once it reaches end-of-input,
    // so a successful read already left the reader closed. try-with-resources additionally
    // releases the reader and parser when parsing fails part-way through.
    try (Reader reader = options.source().createReader(null);
        JsonParser parser = stream.createParser(reader)) {
      Iterator<JsonNode> iter = stream.readValues(parser);
      // The values are consumed inside the try block, i.e. before the parser is closed.
      return convertObjects(iter, options);
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Flattens every value produced by {@code iter} and turns the result into a table.
   *
   * @param iter the top-level JSON values, one per table row
   * @param options options forwarded to {@link TableBuildingUtils#build}
   * @return the resulting table
   */
  private Table convertObjects(Iterator<JsonNode> iter, ReadOptions options) {
    // Flatten row by row and collect column names on the way; the LinkedHashSet keeps the
    // first-seen order of the field names.
    List<JsonNode> flattenedRows = new ArrayList<>();
    Set<String> colNames = new LinkedHashSet<>();
    while (iter.hasNext()) {
      JsonNode flattenedRow = flatten(iter.next());
      flattenedRows.add(flattenedRow);
      flattenedRow.fieldNames().forEachRemaining(colNames::add);
    }

    List<String> columnNames = new ArrayList<>(colNames);
    List<String[]> dataRows = new ArrayList<>(flattenedRows.size());
    for (JsonNode row : flattenedRows) {
      String[] cells = new String[columnNames.size()];
      for (int i = 0; i < cells.length; i++) {
        JsonNode value = row.get(columnNames.get(i));
        // get() yields null for an absent field. A JSON null is handled the same way, because
        // asText() would otherwise produce the literal text "null".
        cells[i] = (value == null || value.isNull()) ? null : value.asText();
      }
      dataRows.add(cells);
    }

    return TableBuildingUtils.build(columnNames, dataRows, options);
  }

  /** Serializes {@code row}, flattens it with {@link JsonFlattener} and parses the result back. */
  private static JsonNode flatten(JsonNode row) {
    try {
      String flattenedRow = JsonFlattener.flatten(mapper.writeValueAsString(row));
      return mapper.readTree(flattenedRow);
    } catch (JsonProcessingException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Reads {@code source} using default {@link JsonlReadOptions}.
   *
   * @param source the input to read
   * @return the table built from the input
   */
  @Override
  public Table read(Source source) {
    return read(JsonlReadOptions.builder(source).build());
  }
}
31 changes: 31 additions & 0 deletions json/src/main/java/tech/tablesaw/io/jsonl/JsonlWriteOptions.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package tech.tablesaw.io.jsonl;

import java.io.Writer;
import tech.tablesaw.io.Destination;
import tech.tablesaw.io.WriteOptions;

/**
 * Options for writing a table as JSON Lines.
 *
 * <p>This class adds no settings of its own on top of {@link WriteOptions}; it only identifies the
 * output format and the {@link Destination} to write to.
 */
public class JsonlWriteOptions extends WriteOptions {

  /** Creates the options from the values held by {@code builder}. */
  private JsonlWriteOptions(Builder builder) {
    super(builder);
  }

  /**
   * Returns a builder whose output goes to {@code destination}.
   *
   * @param destination where the output is written
   */
  public static Builder builder(Destination destination) {
    return new Builder(destination);
  }

  /**
   * Returns a builder whose output goes to {@code writer}, wrapped in a {@link Destination}.
   *
   * @param writer the writer receiving the output
   */
  public static Builder builder(Writer writer) {
    Destination destination = new Destination(writer);
    return builder(destination);
  }

  /** Builder for {@link JsonlWriteOptions}. */
  public static class Builder extends WriteOptions.Builder {

    protected Builder(Destination destination) {
      super(destination);
    }

    /** Creates the options object from this builder. */
    public JsonlWriteOptions build() {
      return new JsonlWriteOptions(this);
    }
  }
}
Loading
Loading