Skip to content

Commit

Permalink
add MySql Bulk Loader transform, apache#2365
Browse files Browse the repository at this point in the history
  • Loading branch information
hansva committed Feb 12, 2025
1 parent 9c6e5a0 commit 966a4b9
Show file tree
Hide file tree
Showing 24 changed files with 3,057 additions and 20 deletions.
6 changes: 6 additions & 0 deletions assemblies/plugins/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1273,6 +1273,12 @@
<version>${project.version}</version>
<type>zip</type>
</dependency>
<dependency>
<groupId>org.apache.hop</groupId>
<artifactId>hop-transform-mysqlbulkloader</artifactId>
<version>${project.version}</version>
<type>zip</type>
</dependency>
<dependency>
<groupId>org.apache.hop</groupId>
<artifactId>hop-transform-normaliser</artifactId>
Expand Down
37 changes: 19 additions & 18 deletions docker/integration-tests/Dockerfile.unit-tests
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,16 @@ ENV DEBIAN_FRONTEND=noninteractive
# this removed the need to calculate the necessary heap Xmx
ENV HOP_OPTIONS=-XX:+AggressiveHeap

# Set Locale correctly
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8

# INSTALL REQUIRED PACKAGES AND ADJUST LOCALE
# procps: The package includes the programs ps, top, vmstat, w, kill, free, slabtop, and skill

# Copy the hop package from the local resources folder to the container image directory

RUN apt-get update \
&& apt-get install --assume-yes \
bash \
Expand All @@ -58,35 +65,29 @@ RUN apt-get update \
&& addgroup -gid ${JENKINS_GID} ${JENKINS_GROUP} \
&& useradd -m -d /home/${JENKINS_USER} -u ${JENKINS_UID} -g ${JENKINS_GROUP} ${JENKINS_USER} \
&& chown ${JENKINS_USER}:${JENKINS_GROUP} ${DEPLOYMENT_PATH} \
&& chown ${JENKINS_USER}:${JENKINS_GROUP} ${VOLUME_MOUNT_POINT}

# Set Locale correctly
RUN sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen && \
locale-gen
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
&& chown ${JENKINS_USER}:${JENKINS_GROUP} ${VOLUME_MOUNT_POINT} \
&& sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen \
&& locale-gen \
# Install parquet-tools from Python
&& pip3 install parquet-tools

# Install parquet-tools from Python

RUN pip3 install parquet-tools

# Copy the hop package from the local resources folder to the container image directory

COPY --chown=${JENKINS_USER}:${JENKINS_GROUP} ./assemblies/client/target/hop ${DEPLOYMENT_PATH}/hop

# Copy gcp key
COPY --chown=${JENKINS_USER}:${JENKINS_GROUP} ${GCP_KEY_FILE} /tmp/google-key-apache-hop-it.json

# Unzip and install in correct location

RUN chown -R ${JENKINS_USER}:${JENKINS_GROUP} ${DEPLOYMENT_PATH}/hop \
&& chmod 700 ${DEPLOYMENT_PATH}/hop/*.sh \
&& cd ${DEPLOYMENT_PATH}/hop \
&& ./hop-conf.sh --generate-fat-jar=/tmp/hop-fatjar.jar \
# Download the Vertica JDBC driver
&& wget -c https://repo1.maven.org/maven2/com/vertica/jdbc/vertica-jdbc/23.4.0-0/vertica-jdbc-23.4.0-0.jar -O /opt/hop/lib/jdbc/vertica-jdbc-23.4.0-0.jar \
&& wget -c https://repo1.maven.org/maven2/org/openjdk/nashorn/nashorn-core/15.4/nashorn-core-15.4.jar -O /opt/hop/plugins/transforms/script/lib/nashorn-core-15.4.jar
&& ./hop-conf.sh --generate-fat-jar=/tmp/hop-fatjar.jar


# Download Additional drivers/dependencies
ADD --chown=${JENKINS_USER}:${JENKINS_GROUP} https://repo1.maven.org/maven2/com/vertica/jdbc/vertica-jdbc/23.4.0-0/vertica-jdbc-23.4.0-0.jar /opt/hop/lib/jdbc/vertica-jdbc-23.4.0-0.jar
ADD --chown=${JENKINS_USER}:${JENKINS_GROUP} https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/9.2.0/mysql-connector-j-9.2.0.jar /opt/hop/lib/jdbc/mysql-connector-j-9.2.0.jar
ADD --chown=${JENKINS_USER}:${JENKINS_GROUP} https://repo1.maven.org/maven2/org/openjdk/nashorn/nashorn-core/15.4/nashorn-core-15.4.jar /opt/hop/plugins/transforms/script/lib/nashorn-core-15.4.jar

# make volume available so that hop pipeline and workflow files can be provided easily
VOLUME ["/files"]
Expand Down
11 changes: 10 additions & 1 deletion docker/integration-tests/integration-tests-database.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,13 @@ services:
interval: 20s
timeout: 10s
retries: 6
start_period: 120s
start_period: 120s

mysql:
image: mysql:9.2.0
ports:
- "127.0.0.1:3306:3306"
environment:
- MYSQL_ROOT_PASSWORD=my-secret-pw
volumes:
- ./resource/mysql/my.cnf:/etc/mysql/conf.d/my.cnf
19 changes: 19 additions & 0 deletions docker/integration-tests/resource/mysql/my.cnf
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

[mysqld]
local-infile
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions docs/hop-user-manual/modules/ROOT/nav.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ under the License.
*** xref:pipeline/transforms/mongodbinput.adoc[MongoDB Input]
*** xref:pipeline/transforms/mongodboutput.adoc[MongoDB Output]
*** xref:pipeline/transforms/multimerge.adoc[Multiway Merge Join]
*** xref:pipeline/transforms/mysqlbulkloader.adoc[MySql Bulk Loader]
*** xref:pipeline/transforms/neo4j-cypher.adoc[Neo4j Cypher]
*** xref:pipeline/transforms/neo4j-gencsv.adoc[Neo4j Generate CSVs]
*** xref:pipeline/transforms/neo4j-getloginfo.adoc[Neo4j Get Logging Info]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ The pages nested under this topic contain information on how to use the transfor
* xref:pipeline/transforms/mongodbinput.adoc[MongoDB Input]
* xref:pipeline/transforms/mongodboutput.adoc[MongoDB Output]
* xref:pipeline/transforms/multimerge.adoc[Multiway Merge Join]
* xref:pipeline/transforms/mysqlbulkloader.adoc[MySql Bulk Loader]
* xref:pipeline/transforms/neo4j-cypher.adoc[Neo4j Cypher]
* xref:pipeline/transforms/neo4j-gencsv.adoc[Neo4j Generate CSVs]
* xref:pipeline/transforms/neo4j-getloginfo.adoc[Neo4j Get Logging Info]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
////
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
////
:documentationPath: /pipeline/transforms/
:language: en_US
:description: The MySql Bulk Loader transform uses the MySQL LOAD DATA command to load data as opposed to sending individual insert statements

= image:transforms/icons/mysqlbulkloader.svg[MySql Bulk loader transform Icon, role="image-doc-icon"] MySql Bulk Loader

[%noheader,cols="3a,1a", role="table-no-borders" ]
|===
|
== Description

The MySql Bulk Loader transform uses the MySQL `LOAD DATA` command to load data as opposed to sending individual insert statements.

It creates a local file, which is then loaded using the `LOAD DATA` command. More information can be found in the https://dev.mysql.com/doc/refman/9.2/en/load-data.html[MySQL LOAD DATA documentation].
|
== Supported Engines
[%noheader,cols="2,1a",frame=none, role="table-supported-engines"]
!===
!Hop Engine! image:check_mark.svg[Supported, 24]
!Spark! image:question_mark.svg[Maybe Supported, 24]
!Flink! image:question_mark.svg[Maybe Supported, 24]
!Dataflow! image:question_mark.svg[Maybe Supported, 24]
!===
|===

== Options

NOTE: Bulk loading must be enabled on both the server and the client. On the client side, add the option `allowLoadLocalInfile=true` to the database connection.
On the server side, the query `show global variables like 'local_infile';` should return "ON".

=== General

[%header, width="90%", cols="1,4"]
|===
|option|description
|Connection| The database connection to use when bulk loading
|Target Schema| (Optional) The schema containing the table being loaded.
|Target Table| The name of the table being loaded.
|Fifo file| Temporary file location
|Delimiter|The delimiter that separates the fields.
|Enclosure|You can specify an enclosure string which when placed around a value allows delimiters to be present in it.
|Escape character|To include delimiter characters in values sometimes an escape string is used like backslash, double backslash and so on.
|Character set|The used character set (optional).
|Bulk size (rows)|This will split the data load into multiple chunks.
|Use replace clause|With REPLACE, new rows that have the same value as a unique key value in an existing row replace the existing row.
|Use Ignore clause|With IGNORE, new rows that duplicate an existing row on a unique key value are discarded.
|Local data|If LOCAL is not specified, the file must be located on the server host.
|===

=== Fields

[%header, width="90%", cols="1,4"]
|===
|option|description
|Table field|Name of the field in the table.
|Stream field|Name of the field in the stream.
|Field format OK? a|You can decide if the format should be kept (Don't change formatting) or changed:

* Format as Date (yyyy-MM-dd)
* Format as a timestamp (yyyy-MM-dd HH:mm:ss)
* Format as Number (grouping symbol is "," - decimal is ".")
* Escape enclosure characters when found
|===
Loading

0 comments on commit 966a4b9

Please sign in to comment.