diff --git a/cli/build.gradle b/cli/build.gradle
index 7fa96f5f20..b075286a84 100644
--- a/cli/build.gradle
+++ b/cli/build.gradle
@@ -28,6 +28,8 @@ dependencies {
     implementation 'net.sf.jopt-simple:jopt-simple:[5.0,6.0)'
     implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk8'
 
+    implementation 'org.apache.commons:commons-csv:1.8'
+    testImplementation 'org.junit.vintage:junit-vintage-engine:5.7.0'
 }
diff --git a/cli/src/org/partiql/cli/functions/ReadFile.kt b/cli/src/org/partiql/cli/functions/ReadFile.kt
index 7227d27626..9b6b3fe2f2 100644
--- a/cli/src/org/partiql/cli/functions/ReadFile.kt
+++ b/cli/src/org/partiql/cli/functions/ReadFile.kt
@@ -15,6 +15,7 @@ package org.partiql.cli.functions
 
 import com.amazon.ion.IonStruct
+import org.apache.commons.csv.CSVFormat
 import org.partiql.lang.eval.Environment
 import org.partiql.lang.eval.ExprValue
 import org.partiql.lang.eval.ExprValueFactory
@@ -34,14 +35,30 @@ internal class ReadFile(valueFactory: ExprValueFactory) : BaseFunction(valueFact
         ConversionMode.values().find { it.name.toLowerCase() == name } ?: throw IllegalArgumentException(
             "Unknown conversion: $name")
 
-    private fun delimitedReadHandler(delimiter: Char): (InputStream, IonStruct) -> ExprValue = { input, options ->
+    private fun fileReadHandler(csvFormat: CSVFormat): (InputStream, IonStruct) -> ExprValue = { input, options ->
         val encoding = options["encoding"]?.stringValue() ?: "UTF-8"
+        val reader = InputStreamReader(input, encoding)
         val conversion = options["conversion"]?.stringValue() ?: "none"
+        val hasHeader = options["header"]?.booleanValue() ?: false
+        val ignoreEmptyLine = options["ignore_empty_line"]?.booleanValue() ?: true
+        val ignoreSurroundingSpace = options["ignore_surrounding_space"]?.booleanValue() ?: true
+        val trim = options["trim"]?.booleanValue() ?: true
+        val delimiter = options["delimiter"]?.stringValue()?.first() // CSVParser library only accepts a single character as delimiter
+        val record = options["line_breaker"]?.stringValue()
+        val escape = options["escape"]?.stringValue()?.first() // CSVParser library only accepts a single character as escape
+        val quote = options["quote"]?.stringValue()?.first() // CSVParser library only accepts a single character as quote
 
-        val reader = InputStreamReader(input, encoding)
+        val csvFormatWithOptions = csvFormat.withIgnoreEmptyLines(ignoreEmptyLine)
+            .withIgnoreSurroundingSpaces(ignoreSurroundingSpace)
+            .withTrim(trim)
+            .let { if (hasHeader) it.withFirstRecordAsHeader() else it }
+            .let { if (delimiter != null) it.withDelimiter(delimiter) else it }
+            .let { if (record != null) it.withRecordSeparator(record) else it }
+            .let { if (escape != null) it.withEscape(escape) else it }
+            .let { if (quote != null) it.withQuote(quote) else it }
 
-        DelimitedValues.exprValue(valueFactory, reader, delimiter, hasHeader, conversionModeFor(conversion))
+        DelimitedValues.exprValue(valueFactory, reader, csvFormatWithOptions, conversionModeFor(conversion))
     }
 
     private fun ionReadHandler(): (InputStream, IonStruct) -> ExprValue = { input, _ ->
@@ -50,8 +67,16 @@ internal class ReadFile(valueFactory: ExprValueFactory) : BaseFunction(valueFact
     private val readHandlers = mapOf(
         "ion" to ionReadHandler(),
-        "tsv" to delimitedReadHandler('\t'),
-        "csv" to delimitedReadHandler(','))
+        "csv" to fileReadHandler(CSVFormat.DEFAULT),
+        "tsv" to fileReadHandler(CSVFormat.DEFAULT.withDelimiter('\t')),
+        "excel_csv" to fileReadHandler(CSVFormat.EXCEL),
+        "mysql_csv" to fileReadHandler(CSVFormat.MYSQL),
+        "mongodb_csv" to fileReadHandler(CSVFormat.MONGODB_CSV),
+        "mongodb_tsv" to fileReadHandler(CSVFormat.MONGODB_TSV),
+        "postgresql_csv" to fileReadHandler(CSVFormat.POSTGRESQL_CSV),
+        "postgresql_text" to fileReadHandler(CSVFormat.POSTGRESQL_TEXT),
+        "customized" to fileReadHandler(CSVFormat.DEFAULT)
+    )
 
     override fun call(env: Environment, args: List<ExprValue>): ExprValue {
         val options = optionsStruct(1, args)
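The new `fileReadHandler` above maps each field of the options struct onto the corresponding Apache Commons CSV builder call. As a standalone sketch of how that chain of `with*` calls behaves (illustrative input and option values only, assuming the commons-csv 1.8 dependency added in `cli/build.gradle`; this is not code from the change itself):

```kotlin
import org.apache.commons.csv.CSVFormat
import org.apache.commons.csv.CSVParser
import java.io.StringReader

fun main() {
    // Start from a base format and layer options on top, as fileReadHandler does.
    val format = CSVFormat.DEFAULT
        .withIgnoreEmptyLines(true)
        .withIgnoreSurroundingSpaces(true)
        .withTrim(true)
        .withFirstRecordAsHeader()   // applied only when the options struct has header:true
        .withDelimiter('|')          // applied only when a delimiter option is supplied

    CSVParser(StringReader("id|name\n1|Bob\n"), format).use { parser ->
        println(parser.headerNames)                  // [id, name]
        parser.forEach { record -> println(record.toMap()) }
    }
}
```

Each `with*` call returns a new immutable `CSVFormat`, which is why the handler can branch with `let` without mutating the shared base formats.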
diff --git a/cli/test/org/partiql/cli/functions/ReadFileTest.kt b/cli/test/org/partiql/cli/functions/ReadFileTest.kt
index 43c044d390..e8aad3e622 100644
--- a/cli/test/org/partiql/cli/functions/ReadFileTest.kt
+++ b/cli/test/org/partiql/cli/functions/ReadFileTest.kt
@@ -156,4 +156,100 @@ class ReadFileTest {
 
         assertEquals(ion.singleValue(expected), actual)
     }
+
+    @Test
+    fun readExcelCsvFile() {
+        writeFile("simple_excel.csv", "title,category,price\nharry potter,book,7.99")
+
+        val args = listOf("\"${dirPath("simple_excel.csv")}\"", "{type:\"excel_csv\", header:true}").map { it.exprValue() }
+
+        val actual = function.call(env, args).ionValue
+        val expected = "[{title:\"harry potter\",category:\"book\",price:\"7.99\"}]"
+
+        assertEquals(ion.singleValue(expected), actual)
+    }
+
+    @Test
+    fun readPostgreSQLCsvFile() {
+        writeFile("simple_postgresql.csv", "id,name,balance\n1,Bob,10000.00")
+
+        val args = listOf("\"${dirPath("simple_postgresql.csv")}\"", "{type:\"postgresql_csv\", header:true}").map { it.exprValue() }
+
+        val actual = function.call(env, args).ionValue
+        val expected = "[{id:\"1\",name:\"Bob\",balance:\"10000.00\"}]"
+
+        assertEquals(ion.singleValue(expected), actual)
+    }
+
+    @Test
+    fun readCustomizedCsvFile1() { // delimiter
+        writeFile("customized.csv", "id name balance\n1 Bob 10000.00")
+
+        val args = listOf("\"${dirPath("customized.csv")}\"", "{type:\"customized\", header:true, delimiter:' '}").map { it.exprValue() }
+
+        val actual = function.call(env, args).ionValue
+        val expected = "[{id:\"1\",name:\"Bob\",balance:\"10000.00\"}]"
+
+        assertEquals(ion.singleValue(expected), actual)
+    }
+
+    @Test
+    fun readCustomizedCsvFile2() { // ignore_empty_line
+        writeFile("customized.csv", "id,name,balance\n\n1,Bob,10000.00")
+
+        val args = listOf("\"${dirPath("customized.csv")}\"", "{type:\"customized\", header:true, ignore_empty_line: false}").map { it.exprValue() }
+
+        val actual = function.call(env, args).ionValue
+        val expected = "[{id:\"\"},{id:\"1\",name:\"Bob\",balance:\"10000.00\"}]"
+
+        assertEquals(ion.singleValue(expected), actual)
+    }
+
+    @Test
+    fun readCustomizedCsvFile3() { // trim and ignore_surrounding_space
+        writeFile("customized.csv", "id,name,balance\n 1 , Bob , 10000.00 ")
+
+        val args = listOf("\"${dirPath("customized.csv")}\"", "{type:\"customized\", header:true, ignore_surrounding_space:false, trim:false}").map { it.exprValue() }
+
+        val actual = function.call(env, args).ionValue
+        val expected = "[{id:\" 1 \",name:\" Bob \",balance:\" 10000.00 \"}]"
+
+        assertEquals(ion.singleValue(expected), actual)
+    }
+
+    @Test
+    fun readCustomizedCsvFile4() { // line_breaker
+        writeFile("customized.csv", "id,name,balance\r\n1,Bob,10000.00")
+
+        val args = listOf("\"${dirPath("customized.csv")}\"", "{type:\"customized\", header:true, line_breaker:'\\\r\\\n'}").map { it.exprValue() }
+
+        val actual = function.call(env, args).ionValue
+        val expected = "[{id:\"1\",name:\"Bob\",balance:\"10000.00\"}]"
+
+        assertEquals(ion.singleValue(expected), actual)
+    }
+
+    @Test
+    fun readCustomizedCsvFile5() { // escape
+        writeFile("customized.csv", "id,name,balance\n\"/\"1\",Bob,10000.00")
+
+        val args = listOf("\"${dirPath("customized.csv")}\"", "{type:\"customized\", header:true, escape:'/'}").map { it.exprValue() }
+
+        val actual = function.call(env, args).ionValue
+        val expected = "[{id:\"\\\"1\",name:\"Bob\",balance:\"10000.00\"}]"
+
+        assertEquals(ion.singleValue(expected), actual)
+    }
+
+    @Test
+    fun readCustomizedCsvFile6() { // quote
+        writeFile("customized.csv", "id,name,balance\n'1,',Bob,10000.00")
+
+        val args = listOf("\"${dirPath("customized.csv")}\"", "{type:\"customized\", header:true, quote:\"'\"}").map { it.exprValue() }
+
+        val actual = function.call(env, args).ionValue
+        val expected = "[{id:\"1,\",name:\"Bob\",balance:\"10000.00\"}]"
+
+        assertEquals(ion.singleValue(expected), actual)
+    }
 }
diff --git a/docs/user/CLI.md b/docs/user/CLI.md
index 23855bf482..bf8543005f 100644
--- a/docs/user/CLI.md
+++ b/docs/user/CLI.md
@@ -641,3 +641,35 @@ Kumo dog
 Mochi dog
 Lilikoi unicorn
 ```
+
+## Predefined CSV Data
+
+The `read_file` function also provides options to read several predefined CSV data formats.
+For example, if a CSV file was exported from PostgreSQL, we can use the following command
+to read the file:
+```
+read_file('simple_postgresql.csv', {'type':'postgresql_csv'})
+```
+The other available values for the `type` argument besides `postgresql_csv` are `excel_csv`, `mysql_csv`, `mongodb_csv`, `mongodb_tsv`, and `postgresql_text`.
+
+## Customized CSV Data
+The `read_file` function also provides options to read customized CSV data formats.
+For example, given a data file that uses a single space as the separator:
+```
+title category price
+harry_potter book 7.99
+dot electronics 49.99
+echo electronics 99.99
+```
+We can use the following command to read the file:
+```
+read_file('customized.csv', {'type':'customized', 'delimiter':' ', 'header':true})
+```
+All available options for customized CSV files are listed below:
+1. Ignore empty lines: `'ignore_empty_line':true`
+2. Ignore spaces around the delimiter: `'ignore_surrounding_space':true`
+3. Trim leading and trailing blanks: `'trim':true`
+4. Set the line separator (only '\\r', '\\n' and '\\r\\n' are supported): `'line_breaker':'\n'`
+5. Set the escape character (single character only): `'escape':'\'`
+6. Set the quote character (single character only): `'quote':'"'`
+7. Set the delimiter (single character only): `'delimiter':','`
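For instance, several of the documented options can be combined in one call, such as `read_file('customized.csv', {'type':'customized', 'header':true, 'delimiter':'|', 'ignore_empty_line':false})` (a hypothetical combination for illustration, not an example taken from the documentation above).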
diff --git a/lang/src/org/partiql/lang/eval/io/DelimitedValues.kt b/lang/src/org/partiql/lang/eval/io/DelimitedValues.kt
index e83654386c..ed33a6914d 100644
--- a/lang/src/org/partiql/lang/eval/io/DelimitedValues.kt
+++ b/lang/src/org/partiql/lang/eval/io/DelimitedValues.kt
@@ -59,25 +59,18 @@ object DelimitedValues {
      * Lazily loads a stream of values from a [Reader] into a sequence backed [ExprValue].
      * This does **not** close the [Reader].
      *
-     * @param ion The system to use.
     * @param input The input source.
-     * @param delimiter The delimiter to use between columns.
-     * @param hasHeader Whether the first row of the delimited input defines the columns.
+     * @param csvFormat The CSV format to use when parsing the input.
      * @param conversionMode How column text should be converted.
      */
     @JvmStatic
     fun exprValue(valueFactory: ExprValueFactory,
                   input: Reader,
-                  delimiter: Char,
-                  hasHeader: Boolean,
+                  csvFormat: CSVFormat,
                   conversionMode: ConversionMode): ExprValue {
         val reader = BufferedReader(input)
-        val csvFormat = when (hasHeader){
-            true -> CSVFormat.DEFAULT.withDelimiter(delimiter).withFirstRecordAsHeader()
-            false -> CSVFormat.DEFAULT.withDelimiter(delimiter)
-        }
         val csvParser = CSVParser(reader, csvFormat)
-        val columns: List<String> = csvParser.headerNames // `columns` is an empty list when `hasHeader` is false
+        val columns: List<String> = csvParser.headerNames
 
         val seq = csvParser.asSequence().map { csvRecord ->
             valueFactory.newStruct(
diff --git a/lang/test/org/partiql/lang/eval/io/DelimitedValuesTest.kt b/lang/test/org/partiql/lang/eval/io/DelimitedValuesTest.kt
index 997c02734f..c27ed620b8 100644
--- a/lang/test/org/partiql/lang/eval/io/DelimitedValuesTest.kt
+++ b/lang/test/org/partiql/lang/eval/io/DelimitedValuesTest.kt
@@ -14,6 +14,7 @@ package org.partiql.lang.eval.io
 
+import org.apache.commons.csv.CSVFormat
 import org.partiql.lang.*
 import org.partiql.lang.eval.io.DelimitedValues.ConversionMode
 import org.partiql.lang.eval.io.DelimitedValues.ConversionMode.*
@@ -32,10 +33,9 @@ class DelimitedValuesTest : TestBase() {
     }
 
     private fun read(text: String,
-                     delimiter: Char,
-                     hasHeader: Boolean,
+                     csvFormat: CSVFormat,
                      conversionMode: ConversionMode): ExprValue =
-        DelimitedValues.exprValue(valueFactory, StringReader(text), delimiter, hasHeader, conversionMode)
+        DelimitedValues.exprValue(valueFactory, StringReader(text), csvFormat, conversionMode)
 
     private fun assertWrite(expectedText: String,
                             valueText: String,
@@ -78,8 +78,7 @@
             """[]""",
             read(
                 "",
-                delimiter = ',',
-                hasHeader = false,
+                CSVFormat.DEFAULT,
                 conversionMode = NONE
             )
         )
@@ -89,8 +88,7 @@
             """[]""",
             read(
                 "",
-                delimiter = ',',
-                hasHeader = false,
+                CSVFormat.DEFAULT,
                 conversionMode = AUTO
             )
         )
@@ -100,8 +98,7 @@
             """[{_1: "1", _2: "2", _3: "3"}]""",
             read(
                 """1,2,3""",
-                delimiter = ',',
-                hasHeader = false,
+                CSVFormat.DEFAULT,
                 conversionMode = NONE
             )
         )
@@ -119,8 +116,7 @@
                    |1.0,2e0,2007-10-10T12:00:00Z
                    |hello,{,}
                 """.trimMargin(),
-                delimiter = ',',
-                hasHeader = false,
+                CSVFormat.DEFAULT,
                 conversionMode = AUTO
             )
         )
@@ -139,8 +135,7 @@
                    |1.0,2e0,2007-10-10T12:00:00Z
                    |hello,{,}
                 """.trimMargin(),
-                delimiter = ',',
-                hasHeader = true,
+                CSVFormat.DEFAULT.withFirstRecordAsHeader(),
                 conversionMode = AUTO
             )
         )
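The column-naming behaviour that the updated `exprValue` relies on can be seen with commons-csv directly: `headerNames` is an empty list when no header is configured, so callers fall back to positional names such as `_1`, `_2`, `_3`, as the tests above expect. A minimal sketch, assuming only the commons-csv 1.8 dependency (the sample data is made up, and the fallback naming here merely mirrors the tests, not the library's own behaviour):

```kotlin
import org.apache.commons.csv.CSVFormat
import org.apache.commons.csv.CSVParser
import java.io.StringReader

fun main() {
    // No header configured: headerNames is empty, so use positional fallback names.
    CSVParser(StringReader("1,2,3"), CSVFormat.DEFAULT).use { parser ->
        println(parser.headerNames)      // []
        parser.forEach { record ->
            val named = (1..record.size()).map { "_$it" }.zip(record.toList())
            println(named)               // [(_1, 1), (_2, 2), (_3, 3)]
        }
    }

    // First record promoted to header: headerNames carries the column names.
    CSVParser(StringReader("a,b,c\n1,2,3"), CSVFormat.DEFAULT.withFirstRecordAsHeader()).use { parser ->
        println(parser.headerNames)      // [a, b, c]
    }
}
```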