Skip to content

Commit

Permalink
ESQL: LTRIM, RTRIM and fix unicode whitespace (#98590)
Browse files Browse the repository at this point in the history
Here we add support for the following two ESQL functions:
* LTRIM: remove leading spaces from a string
* RTRIM: remove trailing spaces from a string

We also fix an issue with the handling of Unicode whitespace. We
now use Unicode code points to identify whitespace characters
instead of relying on ASCII codes.

Moreover, iterating bytes in a Unicode string needs to consider
that some Unicode characters are encoded using multiple bytes.
  • Loading branch information
nik9000 authored Aug 17, 2023
1 parent aad16b7 commit a380e8c
Show file tree
Hide file tree
Showing 18 changed files with 605 additions and 75 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"Distributed",
"Downsampling",
"EQL",
"ES|QL",
"Engine",
"FIPS",
"Features",
Expand Down
5 changes: 5 additions & 0 deletions docs/changelog/98590.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 98590
summary: "ESQL: LTRIM, RTRIM and fix unicode whitespace"
area: ES|QL
type: feature
issues: []
4 changes: 4 additions & 0 deletions docs/reference/esql/esql-functions.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ these functions:
* <<esql-is_nan>>
* <<esql-length>>
* <<esql-log10>>
* <<esql-ltrim>>
* <<esql-rtrim>>
* <<esql-mv_avg>>
* <<esql-mv_concat>>
* <<esql-mv_count>>
Expand Down Expand Up @@ -85,6 +87,8 @@ include::functions/is_infinite.asciidoc[]
include::functions/is_nan.asciidoc[]
include::functions/length.asciidoc[]
include::functions/log10.asciidoc[]
include::functions/ltrim.asciidoc[]
include::functions/rtrim.asciidoc[]
include::functions/mv_avg.asciidoc[]
include::functions/mv_concat.asciidoc[]
include::functions/mv_count.asciidoc[]
Expand Down
12 changes: 12 additions & 0 deletions docs/reference/esql/functions/ltrim.asciidoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[[esql-ltrim]]
=== `LTRIM`
Removes leading whitespace from strings.

[source.merge.styled,esql]
----
include::{esql-specs}/string.csv-spec[tag=ltrim]
----
[%header.monospaced.styled,format=dsv,separator=|]
|===
include::{esql-specs}/string.csv-spec[tag=ltrim-result]
|===
12 changes: 12 additions & 0 deletions docs/reference/esql/functions/rtrim.asciidoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[[esql-rtrim]]
=== `RTRIM`
Removes trailing whitespace from strings.

[source.merge.styled,esql]
----
include::{esql-specs}/string.csv-spec[tag=rtrim]
----
[%header.monospaced.styled,format=dsv,separator=|]
|===
include::{esql-specs}/string.csv-spec[tag=rtrim-result]
|===
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ is_infinite |is_infinite(arg1)
is_nan |is_nan(arg1)
length |length(arg1)
log10 |log10(arg1)
ltrim |ltrim(arg1)
max |max(arg1)
median |median(arg1)
median_absolute_deviation|median_absolute_deviation(arg1)
Expand All @@ -51,6 +52,7 @@ percentile |percentile(arg1, arg2)
pi |pi()
pow |pow(arg1, arg2)
round |round(arg1, arg2)
rtrim |rtrim(arg1)
sin |sin(arg1)
sinh |sinh(arg1)
split |split(arg1, arg2)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,25 +165,91 @@ emp_no:integer | last_name:keyword | x:keyword | z:keyword
10010 | Piveteau | P | a
;

ltrim
from employees | sort emp_no | limit 10 | eval name = concat(" ", first_name, " ") | eval name = ltrim(name) | eval name = concat("'", name, "'") | keep emp_no, name;

emp_no:integer | name:keyword
10001 | 'Georgi '
10002 | 'Bezalel '
10003 | 'Parto '
10004 | 'Chirstian '
10005 | 'Kyoichi '
10006 | 'Anneke '
10007 | 'Tzvetan '
10008 | 'Saniya '
10009 | 'Sumant '
10010 | 'Duangkaew '
;

ltrimRow
// tag::ltrim[]
ROW message = " some text ", color = " red "
| EVAL message = LTRIM(message)
| EVAL color = LTRIM(color)
| EVAL message = CONCAT("'", message, "'")
| EVAL color = CONCAT("'", color, "'")
// end::ltrim[]
;

// tag::ltrim-result[]
message:keyword | color:keyword
'some text ' | 'red '
// end::ltrim-result[]
;

rtrim
from employees | sort emp_no | limit 10 | eval name = concat(" ", first_name, " ") | eval name = rtrim(name) | eval name = concat("'", name, "'") | keep emp_no, name;

emp_no:integer | name:keyword
10001 | ' Georgi'
10002 | ' Bezalel'
10003 | ' Parto'
10004 | ' Chirstian'
10005 | ' Kyoichi'
10006 | ' Anneke'
10007 | ' Tzvetan'
10008 | ' Saniya'
10009 | ' Sumant'
10010 | ' Duangkaew'
;

rtrimRow
// tag::rtrim[]
ROW message = " some text ", color = " red "
| EVAL message = RTRIM(message)
| EVAL color = RTRIM(color)
| EVAL message = CONCAT("'", message, "'")
| EVAL color = CONCAT("'", color, "'")
// end::rtrim[]
;

// tag::rtrim-result[]
message:keyword | color:keyword
' some text' | ' red'
// end::rtrim-result[]
;

trim
from employees | sort emp_no | limit 10 | eval name = concat(" ", first_name) | eval name = trim(first_name) | keep emp_no, name;
from employees | sort emp_no | limit 10 | eval name = concat(" ", first_name, " ") | eval name = trim(name) | eval name = concat("'", name, "'") | keep emp_no, name;

emp_no:integer | name:keyword
10001 | Georgi
10002 | Bezalel
10003 | Parto
10004 | Chirstian
10005 | Kyoichi
10006 | Anneke
10007 | Tzvetan
10008 | Saniya
10009 | Sumant
10010 | Duangkaew
10001 | 'Georgi'
10002 | 'Bezalel'
10003 | 'Parto'
10004 | 'Chirstian'
10005 | 'Kyoichi'
10006 | 'Anneke'
10007 | 'Tzvetan'
10008 | 'Saniya'
10009 | 'Sumant'
10010 | 'Duangkaew'
;

trimRow
// tag::trim[]
ROW message = " some text ", color = " red "| EVAL message = TRIM(message)| EVAL color = TRIM(color)
ROW message = " some text ", color = " red "
| EVAL message = TRIM(message)
| EVAL color = TRIM(color)
// end::trim[]
;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
// or more contributor license agreements. Licensed under the Elastic License
// 2.0; you may not use this file except in compliance with the Elastic License
// 2.0.
package org.elasticsearch.xpack.esql.expression.function.scalar.string;

import java.lang.Override;
import java.lang.String;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.compute.data.Block;
import org.elasticsearch.compute.data.BytesRefBlock;
import org.elasticsearch.compute.data.BytesRefVector;
import org.elasticsearch.compute.data.Page;
import org.elasticsearch.compute.operator.EvalOperator;

/**
 * {@link EvalOperator.ExpressionEvaluator} implementation for {@link LTrim}.
 * <p>
 * Evaluates the wrapped expression against a {@link Page} and feeds every
 * single-valued position through {@code LTrim.process}. Positions that are
 * null or that do not hold exactly one value produce a null in the result.
 * <p>
 * This class is generated. Do not edit it.
 */
public final class LTrimEvaluator implements EvalOperator.ExpressionEvaluator {
  private final EvalOperator.ExpressionEvaluator val;

  public LTrimEvaluator(EvalOperator.ExpressionEvaluator val) {
    this.val = val;
  }

  /**
   * Evaluates the wrapped expression for the page and applies the function
   * to the result, using the vector form of the input when one is available.
   *
   * @param page the page to evaluate
   * @return a constant null block when every input value is null, otherwise
   *         a block with one entry per position of the page
   */
  @Override
  public Block eval(Page page) {
    Block evaluated = val.eval(page);
    if (evaluated.areAllValuesNull()) {
      return Block.constantNullBlock(page.getPositionCount());
    }
    BytesRefBlock asBytesRefBlock = (BytesRefBlock) evaluated;
    BytesRefVector asBytesRefVector = asBytesRefBlock.asVector();
    if (asBytesRefVector != null) {
      // A vector view exists, so take the path without per-position null checks.
      return eval(page.getPositionCount(), asBytesRefVector).asBlock();
    }
    return eval(page.getPositionCount(), asBytesRefBlock);
  }

  /**
   * Block variant: appends a null for every position that is null or does
   * not hold exactly one value, and the processed value otherwise.
   */
  public BytesRefBlock eval(int positionCount, BytesRefBlock valBlock) {
    BytesRef scratch = new BytesRef();
    BytesRefBlock.Builder builder = BytesRefBlock.newBlockBuilder(positionCount);
    for (int pos = 0; pos < positionCount; pos++) {
      boolean singleValued = valBlock.isNull(pos) == false && valBlock.getValueCount(pos) == 1;
      if (singleValued) {
        int valueIndex = valBlock.getFirstValueIndex(pos);
        BytesRef input = valBlock.getBytesRef(valueIndex, scratch);
        builder.appendBytesRef(LTrim.process(input));
      } else {
        builder.appendNull();
      }
    }
    return builder.build();
  }

  /**
   * Vector variant: processes every position unconditionally.
   */
  public BytesRefVector eval(int positionCount, BytesRefVector valVector) {
    BytesRef scratch = new BytesRef();
    BytesRefVector.Builder builder = BytesRefVector.newVectorBuilder(positionCount);
    for (int pos = 0; pos < positionCount; pos++) {
      BytesRef input = valVector.getBytesRef(pos, scratch);
      builder.appendBytesRef(LTrim.process(input));
    }
    return builder.build();
  }

  @Override
  public String toString() {
    return "LTrimEvaluator[" + "val=" + val + "]";
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
// or more contributor license agreements. Licensed under the Elastic License
// 2.0; you may not use this file except in compliance with the Elastic License
// 2.0.
package org.elasticsearch.xpack.esql.expression.function.scalar.string;

import java.lang.Override;
import java.lang.String;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.compute.data.Block;
import org.elasticsearch.compute.data.BytesRefBlock;
import org.elasticsearch.compute.data.BytesRefVector;
import org.elasticsearch.compute.data.Page;
import org.elasticsearch.compute.operator.EvalOperator;

/**
 * {@link EvalOperator.ExpressionEvaluator} implementation for {@link RTrim}.
 * <p>
 * Evaluates the wrapped expression against a {@link Page} and feeds every
 * single-valued position through {@code RTrim.process}. Positions that are
 * null or that do not hold exactly one value produce a null in the result.
 * <p>
 * This class is generated. Do not edit it.
 */
public final class RTrimEvaluator implements EvalOperator.ExpressionEvaluator {
  private final EvalOperator.ExpressionEvaluator val;

  public RTrimEvaluator(EvalOperator.ExpressionEvaluator val) {
    this.val = val;
  }

  /**
   * Evaluates the wrapped expression for the page and applies the function
   * to the result, using the vector form of the input when one is available.
   *
   * @param page the page to evaluate
   * @return a constant null block when every input value is null, otherwise
   *         a block with one entry per position of the page
   */
  @Override
  public Block eval(Page page) {
    Block evaluated = val.eval(page);
    if (evaluated.areAllValuesNull()) {
      return Block.constantNullBlock(page.getPositionCount());
    }
    BytesRefBlock asBytesRefBlock = (BytesRefBlock) evaluated;
    BytesRefVector asBytesRefVector = asBytesRefBlock.asVector();
    if (asBytesRefVector != null) {
      // A vector view exists, so take the path without per-position null checks.
      return eval(page.getPositionCount(), asBytesRefVector).asBlock();
    }
    return eval(page.getPositionCount(), asBytesRefBlock);
  }

  /**
   * Block variant: appends a null for every position that is null or does
   * not hold exactly one value, and the processed value otherwise.
   */
  public BytesRefBlock eval(int positionCount, BytesRefBlock valBlock) {
    BytesRef scratch = new BytesRef();
    BytesRefBlock.Builder builder = BytesRefBlock.newBlockBuilder(positionCount);
    for (int pos = 0; pos < positionCount; pos++) {
      boolean singleValued = valBlock.isNull(pos) == false && valBlock.getValueCount(pos) == 1;
      if (singleValued) {
        int valueIndex = valBlock.getFirstValueIndex(pos);
        BytesRef input = valBlock.getBytesRef(valueIndex, scratch);
        builder.appendBytesRef(RTrim.process(input));
      } else {
        builder.appendNull();
      }
    }
    return builder.build();
  }

  /**
   * Vector variant: processes every position unconditionally.
   */
  public BytesRefVector eval(int positionCount, BytesRefVector valVector) {
    BytesRef scratch = new BytesRef();
    BytesRefVector.Builder builder = BytesRefVector.newVectorBuilder(positionCount);
    for (int pos = 0; pos < positionCount; pos++) {
      BytesRef input = valVector.getBytesRef(pos, scratch);
      builder.appendBytesRef(RTrim.process(input));
    }
    return builder.build();
  }

  @Override
  public String toString() {
    return "RTrimEvaluator[" + "val=" + val + "]";
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,9 @@
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvMin;
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvSum;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.Concat;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.LTrim;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.Length;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.RTrim;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.Split;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.StartsWith;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.Substring;
Expand Down Expand Up @@ -129,6 +131,8 @@ private FunctionDefinition[][] functions() {
def(Length.class, Length::new, "length"),
def(Substring.class, Substring::new, "substring"),
def(Concat.class, Concat::new, "concat"),
def(LTrim.class, LTrim::new, "ltrim"),
def(RTrim.class, RTrim::new, "rtrim"),
def(Trim.class, Trim::new, "trim"),
def(StartsWith.class, StartsWith::new, "starts_with") },
// date
Expand Down
Loading

0 comments on commit a380e8c

Please sign in to comment.