diff --git a/.github/workflows/cicd.yaml b/.github/workflows/cicd.yaml index de76ed04a5f..fd2aaf867fb 100644 --- a/.github/workflows/cicd.yaml +++ b/.github/workflows/cicd.yaml @@ -171,6 +171,10 @@ jobs: run: | echo "SQL_JAVASDK_ENABLE=ON" >> $GITHUB_ENV + - name: Install Dependencies + run: | + brew install z3 + - name: configure env: SQL_JAVASDK_ENABLE: ${{ env.SQL_JAVASDK_ENABLE }} @@ -204,6 +208,13 @@ jobs: path: openmldb-*.tar.gz name: release-artifacts + - name: Upload Event File + if: always() + uses: actions/upload-artifact@v3 + with: + name: event-file + path: ${{ github.event_path }} + release: runs-on: ubuntu-latest # if test failed, shouldn't release diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 6d85fb54792..2e44ddc5328 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -43,12 +43,12 @@ jobs: BUILD_SHARED_LIBS: ON TESTING_ENABLE_STRIP: ON steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: download lcov - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: linux-test-project/lcov ref: v1.16 @@ -73,10 +73,34 @@ jobs: - name: debug if: always() - run: | + run: | du -d 1 -h build df -h + - name: upload coverage + uses: actions/upload-artifact@v3 + with: + # include the generated html report in build/coverage, great for local diagnose + name: coverage-cpp-${{ github.sha }} + path: | + build/coverage.* + build/coverage/ + retention-days: 3 + + coverage-publish: + needs: ["coverage"] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: true + + - name: Download Artifacts + uses: actions/download-artifact@v3 + with: + name: coverage-cpp-${{ github.sha }} + path: build + - name: Upload Coverage Report uses: codecov/codecov-action@v3 with: @@ -84,7 +108,3 @@ jobs: name: coverage-cpp fail_ci_if_error: true verbose: true - - - name: stop service - run: | - ./steps/ut_zookeeper.sh stop diff --git a/.github/workflows/hybridse-ci.yml b/.github/workflows/hybridse-ci.yml index bd0f722fe39..7da8e5ac100 100644 --- a/.github/workflows/hybridse-ci.yml +++ b/.github/workflows/hybridse-ci.yml @@ -57,7 +57,7 @@ jobs: # this option enables print inner output rows of each node in runner HYBRIDSE_DEBUG: true steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install dependencies run: | @@ -79,6 +79,13 @@ jobs: run: | make hybridse-test + - name: Upload Event File + if: always() + uses: actions/upload-artifact@v3 + with: + name: event-file + path: ${{ github.event_path }} + - name: Upload Cpp UT Results if: always() uses: actions/upload-artifact@v3 @@ -89,7 +96,7 @@ jobs: publish-test-results-linux: needs: ["linux-build"] - # the action will only run on 4paradigm/HybridSE's context, not for fork repo or dependabot + # the action will only run on 4paradigm/OpenMLDB's context, not for fork repo or dependabot if: > always() && github.event_name == 'push' || ( github.event.pull_request.head.repo.full_name == github.repository && @@ -104,7 +111,7 @@ jobs: publish-test-results-mac: needs: ["macos-build"] - # the action will only run on 4paradigm/HybridSE's context, not for fork repo or dependabot + # the action will only run on 4paradigm/OpenMLDB's context, not for fork repo or dependabot if: > always() && github.event_name == 'push' || ( github.event.pull_request.head.repo.full_name == github.repository && diff --git a/.github/workflows/integration-test-src.yml b/.github/workflows/integration-test-src.yml index 
62d35ea6e28..d6fd1cfa526 100644 --- a/.github/workflows/integration-test-src.yml +++ b/.github/workflows/integration-test-src.yml @@ -104,7 +104,6 @@ jobs: html_body: test/integration-test/openmldb-test-java/openmldb-sdk-test/target/surefire-reports/html/overview.html to: ${{ secrets.MAIL_TO }} from: GitHub Actions - content_type: text/plain attachments: surefire-reports.tar.gz java-sdk-cluster-memory-task: @@ -148,7 +147,6 @@ jobs: html_body: file://test/integration-test/openmldb-test-java/openmldb-sdk-test/target/surefire-reports/html/overview.html to: ${{ secrets.MAIL_TO }} from: GitHub Actions - content_type: text/plain attachments: surefire-reports.tar.gz java-sdk-cluster-memory-1: @@ -191,7 +189,6 @@ jobs: body: OpenMLDB Memory 1 Test Failed to: ${{ secrets.MAIL_TO }} from: GitHub Actions - content_type: text/plain attachments: surefire-reports.tar.gz java-sdk-cluster-ssd-0: @@ -234,7 +231,6 @@ jobs: body: OpenMLDB SSD Test Failed to: ${{ secrets.MAIL_TO }} from: GitHub Actions - content_type: text/plain attachments: surefire-reports.tar.gz # java-sdk-cluster-ssd-task: @@ -320,7 +316,6 @@ jobs: body: OpenMLDB HDD Test Failed to: ${{ secrets.MAIL_TO }} from: GitHub Actions - content_type: text/plain attachments: surefire-reports.tar.gz # java-sdk-cluster-hdd-task: @@ -568,5 +563,4 @@ jobs: html_body: test/integration-test/openmldb-test-java/openmldb-sdk-test/target/surefire-reports/html/overview.html to: ${{ secrets.MAIL_TO }} from: GitHub Actions - content_type: text/plain attachments: surefire-reports.tar.gz diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 72cf786ed73..bf8db302935 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -7,10 +7,11 @@ jobs: pr-name-lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - uses: actions/setup-node@v3 + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 with: - node-version: '14' + node-version: '18' - name: Install Dependencies - run: npm install @commitlint/config-conventional - - uses: JulienKode/pull-request-name-linter-action@v0.2.0 + run: npm install @commitlint/config-conventional@18.5.0 + + - uses: JulienKode/pull-request-name-linter-action@v0.5.0 diff --git a/.github/workflows/publish-test-result-from-fork.yml b/.github/workflows/publish-cicd-test-result-from-fork.yml similarity index 86% rename from .github/workflows/publish-test-result-from-fork.yml rename to .github/workflows/publish-cicd-test-result-from-fork.yml index 76682644ddf..2074f7a5e07 100644 --- a/.github/workflows/publish-test-result-from-fork.yml +++ b/.github/workflows/publish-cicd-test-result-from-fork.yml @@ -21,3 +21,5 @@ jobs: files: artifacts/linux-ut-result-*/**/*.xml check_name: Linux Test Report comment_title: Linux Test Report + event_name: ${{ github.event.workflow_run.event }} + run_id: ${{ github.event.workflow_run.id }} diff --git a/.github/workflows/publish-hybridse-test-result-from-fork.yml b/.github/workflows/publish-hybridse-test-result-from-fork.yml index ff3a963eed8..0910d654dd2 100644 --- a/.github/workflows/publish-hybridse-test-result-from-fork.yml +++ b/.github/workflows/publish-hybridse-test-result-from-fork.yml @@ -21,6 +21,8 @@ jobs: files: artifacts/linux-ut-result-*/**/*.xml check_name: HybridSE Linux Test Report comment_title: HybridSE Linux Test Report + event_name: ${{ github.event.workflow_run.event }} + run_id: ${{ github.event.workflow_run.id }} test-results-mac: if: > @@ -31,7 +33,9 @@ jobs: name: Publish HybridSE Mac UT Results uses: 
./.github/workflows/publish-test-result-from-fork-action.yml with: - commit: ${{ github.event.workflow_run.head_sha }} - files: artifacts/macos-ut-result-*/**/*.xml - check_name: HybridSE Mac Test Report - comment_title: HybridSE Mac Test Report + commit: ${{ github.event.workflow_run.head_sha }} + files: artifacts/macos-ut-result-*/**/*.xml + check_name: HybridSE Mac Test Report + comment_title: HybridSE Mac Test Report + event_name: ${{ github.event.workflow_run.event }} + run_id: ${{ github.event.workflow_run.id }} diff --git a/.github/workflows/publish-sdk-test-result-from-fork.yml b/.github/workflows/publish-sdk-test-result-from-fork.yml index c030f4d3e0b..f99e456b844 100644 --- a/.github/workflows/publish-sdk-test-result-from-fork.yml +++ b/.github/workflows/publish-sdk-test-result-from-fork.yml @@ -21,3 +21,5 @@ jobs: files: artifacts/linux-ut-result-*/**/*.xml check_name: SDK Test Report comment_title: SDK Test Report + event_name: ${{ github.event.workflow_run.event }} + run_id: ${{ github.event.workflow_run.id }} diff --git a/.github/workflows/publish-test-result-from-fork-action.yml b/.github/workflows/publish-test-result-from-fork-action.yml index 9521b823021..44d980f3148 100644 --- a/.github/workflows/publish-test-result-from-fork-action.yml +++ b/.github/workflows/publish-test-result-from-fork-action.yml @@ -7,54 +7,36 @@ on: files: required: true type: string + event_name: + required: true + type: string check_name: required: true type: string comment_title: required: true type: string + run_id: + required: true + type: number + +permissions: + checks: write + pull-requests: write + + # required by download step to access artifacts API + actions: read jobs: test-results: runs-on: ubuntu-latest steps: - - name: Download Artifacts - uses: actions/github-script@v3 + - name: Download and Extract Artifacts + uses: dawidd6/action-download-artifact@v3 with: - script: | - var fs = require('fs'); - var path = require('path'); - var artifacts_path = path.join('${{github.workspace}}', 'artifacts') - fs.mkdirSync(artifacts_path, { recursive: true }) - - var artifacts = await github.actions.listWorkflowRunArtifacts({ - owner: context.repo.owner, - repo: context.repo.repo, - run_id: ${{ github.event.workflow_run.id }}, - }); + run_id: ${{ inputs.run_id }} + path: artifacts - for (const artifact of artifacts.data.artifacts) { - var download = await github.actions.downloadArtifact({ - owner: context.repo.owner, - repo: context.repo.repo, - artifact_id: artifact.id, - archive_format: 'zip', - }); - var artifact_path = path.join(artifacts_path, `${artifact.name}.zip`) - fs.writeFileSync(artifact_path, Buffer.from(download.data)); - console.log(`Downloaded ${artifact_path}`); - } - - name: Extract Artifacts - run: | - for file in artifacts/*.zip - do - if [ -f "$file" ] - then - dir="${file/%.zip/}" - mkdir -p "$dir" - unzip -d "$dir" "$file" - fi - done - name: Publish UT Results uses: EnricoMi/publish-unit-test-result-action@v2 with: @@ -62,3 +44,6 @@ jobs: files: ${{ inputs.files }} check_name: ${{ inputs.check_name }} comment_title: ${{ inputs.comment_title }} + event_name: ${{ inputs.event_name }} + # you should upload the event file with the artifact name 'event-file' + event_file: artifacts/event-file/event.json diff --git a/.github/workflows/publish-test-results-action.yml b/.github/workflows/publish-test-results-action.yml index 5b13c8b9c68..841929c1cb1 100644 --- a/.github/workflows/publish-test-results-action.yml +++ b/.github/workflows/publish-test-results-action.yml @@ -11,6 +11,10 @@ on: 
required: true type: string +permissions: + checks: write + pull-requests: write + jobs: publish-test-results: runs-on: ubuntu-latest diff --git a/.github/workflows/sdk.yml b/.github/workflows/sdk.yml index 7fd0a6f1cdd..b06c39d48ab 100644 --- a/.github/workflows/sdk.yml +++ b/.github/workflows/sdk.yml @@ -100,6 +100,13 @@ jobs: run: | ./mvnw --batch-mode test + - name: Upload Event File + if: always() + uses: actions/upload-artifact@v2 + with: + name: event-file + path: ${{ github.event_path }} + - name: upload java ut results if: always() uses: actions/upload-artifact@v2 @@ -126,13 +133,14 @@ jobs: ./mvnw --batch-mode prepare-package ./mvnw --batch-mode scoverage:report - - name: upload maven coverage - uses: codecov/codecov-action@v3 + - name: upload coverage + uses: actions/upload-artifact@v3 with: - files: java/**/target/site/jacoco/jacoco.xml,java/**/target/scoverage.xml - name: coverage-java - fail_ci_if_error: true - verbose: true + name: coverage-java-report-${{ github.sha }} + path: | + java/**/target/site/jacoco/jacoco.xml + java/**/target/scoverage.xml + retention-days: 3 - name: stop services run: | @@ -276,13 +284,14 @@ jobs: python/openmldb_sdk/tests/pytest.xml python/openmldb_tool/tests/pytest.xml - - name: upload python coverage to codecov - uses: codecov/codecov-action@v3 + - name: upload coverage + uses: actions/upload-artifact@v3 with: - name: coverage-python - files: python/openmldb_sdk/tests/coverage.xml,python/openmldb_tool/tests/coverage.xml - fail_ci_if_error: true - verbose: true + name: coverage-python-report-${{ github.sha }} + path: | + python/openmldb_sdk/tests/coverage.xml + python/openmldb_tool/tests/coverage.xml + retention-days: 3 - name: upload to pypi if: > @@ -381,16 +390,12 @@ jobs: working-directory: go run: go test ./... 
-race -covermode=atomic -coverprofile=coverage.out - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3 + - name: upload coverage + uses: actions/upload-artifact@v3 with: - name: coverage-go - files: go/coverage.out - fail_ci_if_error: true - verbose: true - - - name: stop server - run: ./openmldb/sbin/stop-all.sh + name: coverage-go-report-${{ github.sha }} + path: go/coverage.out + retention-days: 3 publish-test-results: needs: ["java-sdk", "python-sdk", "go-sdk"] @@ -407,3 +412,37 @@ jobs: mac-ut-result-*/**/*.xml check_name: SDK Test Report comment_title: SDK Test Report + + publish-coverage-results: + needs: ["java-sdk", "python-sdk", "go-sdk"] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: true + + - name: Download Artifacts (java) + uses: actions/download-artifact@v3 + with: + name: coverage-java-report-${{ github.sha }} + path: java + + - name: Download Artifacts (python) + uses: actions/download-artifact@v3 + with: + name: coverage-python-report-${{ github.sha }} + path: python + + - name: Download Artifacts (go) + uses: actions/download-artifact@v3 + with: + name: coverage-go-report-${{ github.sha }} + path: go + + - name: Upload Coverage Report + uses: codecov/codecov-action@v3 + with: + files: go/coverage.out,python/openmldb_sdk/tests/coverage.xml,python/openmldb_tool/tests/coverage.xml,java/**/target/site/jacoco/jacoco.xml,java/**/target/scoverage.xml + name: coverage-sdk + fail_ci_if_error: true + verbose: true diff --git a/.github/workflows/selfhost_intergration.yml b/.github/workflows/selfhost_intergration.yml index dcaf34cce8d..e1695f20b4f 100644 --- a/.github/workflows/selfhost_intergration.yml +++ b/.github/workflows/selfhost_intergration.yml @@ -38,6 +38,9 @@ jobs: make SQL_JAVASDK_ENABLE=ON && make SQL_JAVASDK_ENABLE=ON install mv openmldb-linux openmldb-main-linux tar -zcf openmldb-linux.tar.gz openmldb-main-linux + mv java/openmldb-batch/target/openmldb*-SNAPSHOT.jar openmldb-batch.jar + mv java/openmldb-jdbc/target/openmldb*-SNAPSHOT.jar openmldb-jdbc.jar + mv java/openmldb-native/target/openmldb*-SNAPSHOT.jar openmldb-native.jar - name: download if: ${{ env.E_VERSION != 'main' }} run: | @@ -48,9 +51,64 @@ jobs: uses: actions/upload-artifact@v3 with: name: openmldb-package - path: openmldb-linux.tar.gz + path: | + openmldb-batch.jar + openmldb-jdbc.jar + openmldb-native.jar + openmldb-linux.tar.gz - + apiserver-cluster: + needs: build-openmldb + + runs-on: [self-hosted,common-user] + steps: + - uses: actions/checkout@v3 + - name: before test + if: ${{ env.ETYPE == 'all' || env.ETYPE == 'java' }} + run: mkdir ${{ github.job }} + - name: download artifact + uses: actions/download-artifact@v3 + with: + name: openmldb-package + - name: install openmldb + run: | + tar -zxf openmldb-linux.tar.gz -C ${{ github.job }}/ + bash test/steps/format_config.sh $(pwd)/${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux ${{ github.job }} 20001 21000 java + bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/deploy-all.sh + bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/start-all.sh + - name: test + run: bash test/steps/openmldb-integration-test.sh -j 0.0.0 -d deploy -m apiserver + - name: stop openmldb + if: always() + run: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/stop-all.sh + - name: remove openmldb + if: always() + run: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/clear-all.sh + - name: TEST Results + if: always() + uses: 
EnricoMi/publish-unit-test-result-action@v1 + with: + files: test/integration-test/openmldb-test-java/openmldb-sdk-test/target/surefire-reports/TEST-*.xml + check_name: "SRC apiserver-cluster Report" + comment_title: "SRC apiserver-cluster Report" + - name: tar test report + if: ${{ failure() }} + run: tar -zcvf surefire-reports.tar.gz test/integration-test/openmldb-test-java/openmldb-sdk-test/target/surefire-reports + - name: Send Email + if: ${{ failure() }} + uses: dawidd6/action-send-mail@master + with: + server_address: mail.4paradigm.com + server_port: 995 + username: ${{ secrets.MAIL_USERNAME }} + password: ${{ secrets.MAIL_PASSWORD }} + secure: true + subject: OpenMLDB Memory Test + body: OpenMLDB Memory Test Failed + html_body: test/integration-test/openmldb-test-java/openmldb-sdk-test/target/surefire-reports/html/overview.html + to: ${{ secrets.MAIL_TO }} + from: GitHub Actions + attachments: surefire-reports.tar.gz java-sdk-cluster-memory-0: needs: build-openmldb @@ -72,16 +130,12 @@ jobs: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/deploy-all.sh bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/start-all.sh - name: test - run: | - mkdir mvnrepo - export MAVEN_OPTS="-Dmaven.repo.local=$(pwd)/mvnrepo" - echo $MAVEN_OPTS - bash test/steps/openmldb-sdk-test-java-src.sh -c test_cluster.xml -d deploy -l "0" -s "memory" + run: bash test/steps/openmldb-integration-test.sh -j 0.0.0 -c test_cluster.xml -d deploy -l "0" -s "memory" - name: stop openmldb if: always() run: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/stop-all.sh - name: remove openmldb - if: success() + if: always() run: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/clear-all.sh - name: TEST Results if: always() @@ -97,10 +151,11 @@ jobs: if: ${{ failure() }} uses: dawidd6/action-send-mail@master with: - server_address: smtp.partner.outlook.cn - server_port: 587 + server_address: mail.4paradigm.com + server_port: 995 username: ${{ secrets.MAIL_USERNAME }} password: ${{ secrets.MAIL_PASSWORD }} + secure: true subject: OpenMLDB Memory Test body: OpenMLDB Memory Test Failed html_body: test/integration-test/openmldb-test-java/openmldb-sdk-test/target/surefire-reports/html/overview.html @@ -129,16 +184,12 @@ jobs: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/deploy-all.sh bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/start-all.sh - name: test - run: | - mkdir mvnrepo - export MAVEN_OPTS="-Dmaven.repo.local=$(pwd)/mvnrepo" - echo $MAVEN_OPTS - bash test/steps/openmldb-sdk-test-java-src.sh -c test_cluster.xml -d deploy -l "1,2,3,4,5" -s "memory" + run: bash test/steps/openmldb-integration-test.sh -j 0.0.0 -c test_cluster.xml -d deploy -l "1,2,3,4,5" -s "memory" - name: stop openmldb if: always() run: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/stop-all.sh - name: remove openmldb - if: success() + if: always() run: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/clear-all.sh - name: TEST Results if: always() @@ -154,10 +205,11 @@ jobs: if: ${{ failure() }} uses: dawidd6/action-send-mail@master with: - server_address: smtp.partner.outlook.cn - server_port: 587 + server_address: mail.4paradigm.com + server_port: 995 username: ${{ secrets.MAIL_USERNAME }} password: ${{ secrets.MAIL_PASSWORD }} + secure: true subject: OpenMLDB Memory Test body: OpenMLDB Memory Test Failed html_body: test/integration-test/openmldb-test-java/openmldb-sdk-test/target/surefire-reports/html/overview.html @@ -187,16 
+239,12 @@ jobs: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/deploy-all.sh bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/start-all.sh - name: test - run: | - mkdir mvnrepo - export MAVEN_OPTS="-Dmaven.repo.local=$(pwd)/mvnrepo" - echo $MAVEN_OPTS - bash test/steps/openmldb-sdk-test-java-src.sh -c test_cluster_disk.xml -d deploy -l "0" -s "hdd" + run: bash test/steps/openmldb-integration-test.sh -j 0.0.0 -c test_cluster_disk.xml -d deploy -l "0" -s "hdd" - name: stop openmldb if: always() run: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/stop-all.sh - name: remove openmldb - if: success() + if: always() run: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/clear-all.sh - name: TEST Results if: always() @@ -212,10 +260,11 @@ jobs: if: ${{ failure() }} uses: dawidd6/action-send-mail@master with: - server_address: smtp.partner.outlook.cn - server_port: 587 + server_address: mail.4paradigm.com + server_port: 995 username: ${{ secrets.MAIL_USERNAME }} password: ${{ secrets.MAIL_PASSWORD }} + secure: true subject: OpenMLDB HDD Test body: OpenMLDB HDD Test Failed html_body: test/integration-test/openmldb-test-java/openmldb-sdk-test/target/surefire-reports/html/overview.html @@ -240,20 +289,16 @@ jobs: - name: install openmldb run: | tar -zxf openmldb-linux.tar.gz -C ${{ github.job }}/ - bash test/steps/format_config.sh $(pwd)/${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux java-sdk-cluster-memory-0 23001 24000 java ssd + bash test/steps/format_config.sh $(pwd)/${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux ${{ github.job }} 23001 24000 java ssd bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/deploy-all.sh bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/start-all.sh - name: test - run: | - mkdir mvnrepo - export MAVEN_OPTS="-Dmaven.repo.local=$(pwd)/mvnrepo" - echo $MAVEN_OPTS - bash test/steps/openmldb-sdk-test-java-src.sh -c test_cluster_disk.xml -d deploy -l "0" -s "ssd" + run: bash test/steps/openmldb-integration-test.sh -j 0.0.0 -c test_cluster_disk.xml -d deploy -l "0" -s "ssd" - name: stop openmldb if: always() run: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/stop-all.sh - name: remove openmldb - if: success() + if: always() run: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/clear-all.sh - name: TEST Results if: always() @@ -269,10 +314,11 @@ jobs: if: ${{ failure() }} uses: dawidd6/action-send-mail@master with: - server_address: smtp.partner.outlook.cn - server_port: 587 + server_address: mail.4paradigm.com + server_port: 995 username: ${{ secrets.MAIL_USERNAME }} password: ${{ secrets.MAIL_PASSWORD }} + secure: true subject: OpenMLDB SSD Test body: OpenMLDB SSD Test Failed html_body: test/integration-test/openmldb-test-java/openmldb-sdk-test/target/surefire-reports/html/overview.html @@ -299,19 +345,15 @@ jobs: bash test/steps/format_config.sh $(pwd)/${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux ${{ github.job }} 24001 25000 java hadoop bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/deploy-all.sh bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/start-all.sh - bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/stop-taskmanager.sh - bash HADOOP_CONF_DIR=/mnt/hdd0/denglong/openmldb_runner_work/hadoop ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/stop-taskmanager.sh + bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/stop-taskmanagers.sh + 
HADOOP_CONF_DIR=/mnt/hdd0/denglong/openmldb_runner_work/hadoop ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/stop-taskmanagers.sh - name: test - run: | - mkdir mvnrepo - export MAVEN_OPTS="-Dmaven.repo.local=$(pwd)/mvnrepo" - echo $MAVEN_OPTS - bash test/steps/openmldb-sdk-test-java-src.sh -c test_yarn.xml -d deploy -l "0" -s "memory" + run: bash test/steps/openmldb-integration-test.sh -j 0.0.0 -c test_yarn.xml -d deploy -l "0" -s "memory" - name: stop openmldb if: always() run: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/stop-all.sh - name: remove openmldb - if: success() + if: always() run: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/clear-all.sh - name: TEST Results if: always() @@ -327,10 +369,11 @@ jobs: if: ${{ failure() }} uses: dawidd6/action-send-mail@master with: - server_address: smtp.partner.outlook.cn - server_port: 587 + server_address: mail.4paradigm.com + server_port: 995 username: ${{ secrets.MAIL_USERNAME }} password: ${{ secrets.MAIL_PASSWORD }} + secure: true subject: OpenMLDB yarn Test body: OpenMLDB yarn Test Failed html_body: test/integration-test/openmldb-test-java/openmldb-sdk-test/target/surefire-reports/html/overview.html @@ -359,15 +402,12 @@ jobs: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/deploy-all.sh bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/start-all.sh - name: test - run: | - mkdir mvnrepo - export MAVEN_OPTS="-Dmaven.repo.local=$(pwd)/mvnrepo" - echo $MAVEN_OPTS + run: echo "a" - name: stop openmldb if: always() run: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/stop-all.sh - name: remove openmldb - if: success() + if: always() run: bash ${{ github.job }}/openmldb-${{ env.E_VERSION }}-linux/sbin/clear-all.sh # - name: TEST Results # if: always() @@ -383,8 +423,9 @@ jobs: # if: ${{ failure() }} # uses: dawidd6/action-send-mail@master # with: - # server_address: smtp.partner.outlook.cn - # server_port: 587 + # server_address: mail.4paradigm.com + # server_port: 995 + # secure: true # username: ${{ secrets.MAIL_USERNAME }} # password: ${{ secrets.MAIL_PASSWORD }} # subject: OpenMLDB yarn Test diff --git a/cases/function/join/test_lastjoin_simple.yaml b/cases/function/join/test_lastjoin_simple.yaml index 4d23b312ef2..589e98bd05b 100644 --- a/cases/function/join/test_lastjoin_simple.yaml +++ b/cases/function/join/test_lastjoin_simple.yaml @@ -1067,4 +1067,4 @@ cases: rows: - [ "aa",2,131,1590738990000 ] - [ "bb",21,NULL,NULL ] - - [ "dd", 41, NULL, NULL ] \ No newline at end of file + - [ "dd", 41, NULL, NULL ] diff --git a/cases/integration_test/cluster/window_and_lastjoin.yaml b/cases/integration_test/cluster/window_and_lastjoin.yaml index 1d629fbe1b1..45cc9599e78 100644 --- a/cases/integration_test/cluster/window_and_lastjoin.yaml +++ b/cases/integration_test/cluster/window_and_lastjoin.yaml @@ -217,16 +217,16 @@ cases: columns : ["id int", "c1 string", "c2 string", "c3 string", "c4 string", "c6 double", "c7 timestamp"] indexs: ["index1:c1:c7", "index2:c2:c7", "index3:c3:c7", "index4:c4:c7"] rows: - - [ 1, "a", "aa", "aaa", "aaaa", "1.0", 1590738990000] - - [ 2, "a", "aa", "aaa", "aaaa", "1.0", 1590738991000] - - [ 3, "a", "aa", "aaa", "bbbb", "1.0", 1590738992000] - - [ 4, "a", "aa", "aaa", "bbbb", "1.0", 1590738993000] - - [ 5, "a", "aa", "bbb", "bbbb", "1.0", 1590738994000] - - [ 6, "a", "aa", "bbb", "bbbb", "1.0", 1590738995000] - - [ 7, "a", "bb", "bbb", "bbbb", "1.0", 1590738996000 ] - - [ 8, "a", "bb", "bbb", "bbbb", "1.0", 1590738997000 
] - - [ 9, "b", "bb", "bbb", "bbbb", "1.0", 1590739998000 ] - - [10, "b", "bb", "bbb", "bbbb", "1.0", 1590739999000 ] + - [ 1, "a", "aa", "aaa", "aaaa", 1.0, 1590738990000] + - [ 2, "a", "aa", "aaa", "aaaa", 1.0, 1590738991000] + - [ 3, "a", "aa", "aaa", "bbbb", 1.0, 1590738992000] + - [ 4, "a", "aa", "aaa", "bbbb", 1.0, 1590738993000] + - [ 5, "a", "aa", "bbb", "bbbb", 1.0, 1590738994000] + - [ 6, "a", "aa", "bbb", "bbbb", 1.0, 1590738995000] + - [ 7, "a", "bb", "bbb", "bbbb", 1.0, 1590738996000 ] + - [ 8, "a", "bb", "bbb", "bbbb", 1.0, 1590738997000 ] + - [ 9, "b", "bb", "bbb", "bbbb", 1.0, 1590739998000 ] + - [10, "b", "bb", "bbb", "bbbb", 1.0, 1590739999000 ] sql: | select * from ( @@ -270,16 +270,16 @@ cases: columns : ["id int", "c1 string", "c2 string", "c3 string", "c4 string", "c6 double", "c7 timestamp"] indexs: ["index1:c1:c7", "index2:c2:c7", "index3:c3:c7", "index4:c4:c7"] rows: - - [ 1, "a", "aa", "aaa", "aaaa", "1.0", 1590738990000] - - [ 2, "a", "aa", "aaa", "aaaa", "1.0", 1590738991000] - - [ 3, "a", "aa", "aaa", "bbbb", "1.0", 1590738992000] - - [ 4, "a", "aa", "aaa", "bbbb", "1.0", 1590738993000] - - [ 5, "a", "aa", "bbb", "bbbb", "1.0", 1590738994000] - - [ 6, "a", "aa", "bbb", "bbbb", "1.0", 1590738995000] - - [ 7, "a", "bb", "bbb", "bbbb", "1.0", 1590738996000 ] - - [ 8, "a", "bb", "bbb", "bbbb", "1.0", 1590738997000 ] - - [ 9, "b", "bb", "bbb", "bbbb", "1.0", 1590739998000 ] - - [10, "b", "bb", "bbb", "bbbb", "1.0", 1590739999000 ] + - [ 1, "a", "aa", "aaa", "aaaa", 1.0, 1590738990000] + - [ 2, "a", "aa", "aaa", "aaaa", 1.0, 1590738991000] + - [ 3, "a", "aa", "aaa", "bbbb", 1.0, 1590738992000] + - [ 4, "a", "aa", "aaa", "bbbb", 1.0, 1590738993000] + - [ 5, "a", "aa", "bbb", "bbbb", 1.0, 1590738994000] + - [ 6, "a", "aa", "bbb", "bbbb", 1.0, 1590738995000] + - [ 7, "a", "bb", "bbb", "bbbb", 1.0, 1590738996000 ] + - [ 8, "a", "bb", "bbb", "bbbb", 1.0, 1590738997000 ] + - [ 9, "b", "bb", "bbb", "bbbb", 1.0, 1590739998000 ] + - [10, "b", "bb", "bbb", "bbbb", 1.0, 1590739999000 ] sql: | select * from ( @@ -299,7 +299,7 @@ cases: window w4 as (PARTITION BY {0}.c4 ORDER BY {0}.c7 ROWS_RANGE BETWEEN 10d PRECEDING AND CURRENT ROW) ) as out4 on out1_id=out4_id; request_plan: | - SIMPLE_PROJECT(sources=(out1_id, c1, w1_sum_c6, out2_id, c2, w2_sum_c6, out3_id, c3, w3_sum_c6, out4.out4_id, out4.c4, out4.w4_sum_c6)) + SIMPLE_PROJECT(sources=(out1.out1_id, out1.c1, out1.w1_sum_c6, out2.out2_id, out2.c2, out2.w2_sum_c6, out3.out3_id, out3.c3, out3.w3_sum_c6, out4.out4_id, out4.c4, out4.w4_sum_c6)) REQUEST_JOIN(type=LastJoin, condition=, left_keys=(out1_id), right_keys=(out4_id), index_keys=) REQUEST_JOIN(type=LastJoin, condition=, left_keys=(out1_id), right_keys=(out3_id), index_keys=) REQUEST_JOIN(type=LastJoin, condition=, left_keys=(out1_id), right_keys=(out2_id), index_keys=) @@ -325,7 +325,7 @@ cases: DATA_PROVIDER(type=Partition, table=auto_t0, index=index4) cluster_request_plan: | - SIMPLE_PROJECT(sources=(out1_id, c1, w1_sum_c6, out2_id, c2, w2_sum_c6, out3_id, c3, w3_sum_c6, out4.out4_id, out4.c4, out4.w4_sum_c6)) + SIMPLE_PROJECT(sources=(out1.out1_id, out1.c1, out1.w1_sum_c6, out2.out2_id, out2.c2, out2.w2_sum_c6, out3.out3_id, out3.c3, out3.w3_sum_c6, out4.out4_id, out4.c4, out4.w4_sum_c6)) REQUEST_JOIN(type=LastJoin, condition=, left_keys=(out1_id), right_keys=(out4_id), index_keys=) REQUEST_JOIN(type=LastJoin, condition=, left_keys=(out1_id), right_keys=(out3_id), index_keys=) REQUEST_JOIN(type=LastJoin, condition=, left_keys=(out1_id), right_keys=(out2_id), 
index_keys=) @@ -391,29 +391,29 @@ cases: columns : ["id int", "c1 string", "c2 string", "c3 string", "c4 string", "c6 double", "c7 timestamp"] indexs: ["index1:c1:c7"] rows: - - [ 1, "a", "aa", "aaa", "aaaa", "1.0", 1590738990000] - - [ 2, "a", "aa", "aaa", "aaaa", "2.0", 1590738991000] - - [ 3, "a", "aa", "aaa", "bbbb", "3.0", 1590738992000] - - [ 4, "a", "aa", "aaa", "bbbb", "4.0", 1590738993000] - - [ 5, "a", "aa", "bbb", "bbbb", "5.0", 1590738994000] - - [ 6, "a", "aa", "bbb", "bbbb", "6.0", 1590738995000] - - [ 7, "a", "bb", "bbb", "bbbb", "7.0", 1590738996000 ] - - [ 8, "a", "bb", "bbb", "bbbb", "8.0", 1590738997000 ] - - [ 9, "b", "bb", "bbb", "bbbb", "9.0", 1590738998000 ] - - [10, "b", "bb", "bbb", "bbbb", "10.0", 1590738999000 ] + - [ 1, "a", "aa", "aaa", "aaaa", 1.0, 1590738990000] + - [ 2, "a", "aa", "aaa", "aaaa", 2.0, 1590738991000] + - [ 3, "a", "aa", "aaa", "bbbb", 3.0, 1590738992000] + - [ 4, "a", "aa", "aaa", "bbbb", 4.0, 1590738993000] + - [ 5, "a", "aa", "bbb", "bbbb", 5.0, 1590738994000] + - [ 6, "a", "aa", "bbb", "bbbb", 6.0, 1590738995000] + - [ 7, "a", "bb", "bbb", "bbbb", 7.0, 1590738996000 ] + - [ 8, "a", "bb", "bbb", "bbbb", 8.0, 1590738997000 ] + - [ 9, "b", "bb", "bbb", "bbbb", 9.0, 1590738998000 ] + - [10, "b", "bb", "bbb", "bbbb", 10.0, 1590738999000 ] - columns: ["rid int", "x1 string", "x2 string", "x3 string", "x4 string", "x6 double", "x7 timestamp"] indexs: ["index1:x1:x7", "index2:x2:x7", "index3:x3:x7", "index4:x4:x7", ] rows: - - [ 1, "a", "aa", "aaa", "aaaa", "1.0", 1590738990000 ] - - [ 2, "a", "aa", "aaa", "aaaa", "1.0", 1590738991000 ] - - [ 3, "a", "aa", "aaa", "bbbb", "1.0", 1590738992000 ] - - [ 4, "a", "aa", "aaa", "bbbb", "1.0", 1590738993000 ] - - [ 5, "a", "aa", "bbb", "bbbb", "1.0", 1590738994000 ] - - [ 6, "a", "aa", "bbb", "bbbb", "1.0", 1590738995000 ] - - [ 7, "a", "bb", "bbb", "bbbb", "1.0", 1590738996000 ] - - [ 8, "a", "bb", "bbb", "bbbb", "1.0", 1590738997000 ] - - [ 9, "b", "bb", "bbb", "bbbb", "1.0", 1590738998000 ] - - [ 10, "b", "bb", "bbb", "bbbb", "1.0",1590738999000 ] + - [ 1, "a", "aa", "aaa", "aaaa", 1.0, 1590738990000 ] + - [ 2, "a", "aa", "aaa", "aaaa", 1.0, 1590738991000 ] + - [ 3, "a", "aa", "aaa", "bbbb", 1.0, 1590738992000 ] + - [ 4, "a", "aa", "aaa", "bbbb", 1.0, 1590738993000 ] + - [ 5, "a", "aa", "bbb", "bbbb", 1.0, 1590738994000 ] + - [ 6, "a", "aa", "bbb", "bbbb", 1.0, 1590738995000 ] + - [ 7, "a", "bb", "bbb", "bbbb", 1.0, 1590738996000 ] + - [ 8, "a", "bb", "bbb", "bbbb", 1.0, 1590738997000 ] + - [ 9, "b", "bb", "bbb", "bbbb", 1.0, 1590738998000 ] + - [ 10, "b", "bb", "bbb", "bbbb", 1.0,1590738999000 ] sql: | select id, c1, c2, c3, c4, c6, c7, cur_hour, today , w1_sum_c6, w1_max_c6, w1_min_c6, w1_avg_c6, w1_cnt_c6 @@ -493,16 +493,16 @@ cases: "t1_rid int32", "t2_rid int32", "t3_rid int32", "t4_rid int32"] order: id rows: - - [ 1, "a", "aa", "aaa", "aaaa", "1.0", 1590738990000, 15, 29, 1.0, 1.0, 1.0, 1.0, 1, NULL, NULL, NULL, NULL] - - [ 2, "a", "aa", "aaa", "aaaa", "2.0", 1590738991000, 15, 29, 3.0, 2.0, 1.0, 1.5, 2, 1, NULL, NULL, NULL ] - - [ 3, "a", "aa", "aaa", "bbbb", "3.0", 1590738992000, 15, 29, 6.0, 3.0, 1.0, 2.0, 3, 2 , 1, NULL, NULL] - - [ 4, "a", "aa", "aaa", "bbbb", "4.0", 1590738993000, 15, 29, 10.0, 4.0, 1.0, 2.5, 4, 3 , 2, 1, NULL] - - [ 5, "a", "aa", "bbb", "bbbb", "5.0", 1590738994000, 15, 29, 15.0, 5.0, 1.0, 3.0, 5, 4 , 3, NULL, NULL] - - [ 6, "a", "aa", "bbb", "bbbb", "6.0", 1590738995000, 15, 29, 21.0, 6.0, 1.0, 3.5, 6, 5 , 4, NULL, NULL] - - [ 7, "a", "bb", "bbb", "bbbb", "7.0", 
1590738996000, 15, 29, 28.0, 7.0, 1.0, 4.0, 7, 6 , NULL, NULL, 3] - - [ 8, "a", "bb", "bbb", "bbbb", "8.0", 1590738997000, 15, 29, 36.0, 8.0, 1.0, 4.5, 8, 7 , NULL, 5, 4] - - [ 9, "b", "bb", "bbb", "bbbb", "9.0", 1590738998000, 15, 29, 9.0, 9.0, 9.0, 9.0, 1, NULL , 7, 6, 5] - - [ 10, "b", "bb", "bbb", "bbbb", "10.0",1590738999000,15, 29, 19.0, 10.0, 9.0, 9.5, 2, 9, 8, 7, 6] + - [ 1, "a", "aa", "aaa", "aaaa", 1.0, 1590738990000, 15, 29, 1.0, 1.0, 1.0, 1.0, 1, NULL, NULL, NULL, NULL] + - [ 2, "a", "aa", "aaa", "aaaa", 2.0, 1590738991000, 15, 29, 3.0, 2.0, 1.0, 1.5, 2, 1, NULL, NULL, NULL ] + - [ 3, "a", "aa", "aaa", "bbbb", 3.0, 1590738992000, 15, 29, 6.0, 3.0, 1.0, 2.0, 3, 2 , 1, NULL, NULL] + - [ 4, "a", "aa", "aaa", "bbbb", 4.0, 1590738993000, 15, 29, 10.0, 4.0, 1.0, 2.5, 4, 3 , 2, 1, NULL] + - [ 5, "a", "aa", "bbb", "bbbb", 5.0, 1590738994000, 15, 29, 15.0, 5.0, 1.0, 3.0, 5, 4 , 3, NULL, NULL] + - [ 6, "a", "aa", "bbb", "bbbb", 6.0, 1590738995000, 15, 29, 21.0, 6.0, 1.0, 3.5, 6, 5 , 4, NULL, NULL] + - [ 7, "a", "bb", "bbb", "bbbb", 7.0, 1590738996000, 15, 29, 28.0, 7.0, 1.0, 4.0, 7, 6 , NULL, NULL, 3] + - [ 8, "a", "bb", "bbb", "bbbb", 8.0, 1590738997000, 15, 29, 36.0, 8.0, 1.0, 4.5, 8, 7 , NULL, 5, 4] + - [ 9, "b", "bb", "bbb", "bbbb", 9.0, 1590738998000, 15, 29, 9.0, 9.0, 9.0, 9.0, 1, NULL , 7, 6, 5] + - [ 10, "b", "bb", "bbb", "bbbb", 10.0,1590738999000,15, 29, 19.0, 10.0, 9.0, 9.5, 2, 9, 8, 7, 6] - @@ -514,29 +514,29 @@ cases: columns : ["id int", "c1 string", "c2 string", "c3 string", "c4 string", "c6 double", "c7 timestamp"] indexs: ["index1:c1:c7"] rows: - - [ 1, "a", "aa", "aaa", "aaaa", "1.0", 1590738990000] - - [ 2, "a", "aa", "aaa", "aaaa", "2.0", 1590738991000] - - [ 3, "a", "aa", "aaa", "bbbb", "3.0", 1590738992000] - - [ 4, "a", "aa", "aaa", "bbbb", "4.0", 1590738993000] - - [ 5, "a", "aa", "bbb", "bbbb", "5.0", 1590738994000] - - [ 6, "a", "aa", "bbb", "bbbb", "6.0", 1590738995000] - - [ 7, "a", "bb", "bbb", "bbbb", "7.0", 1590738996000 ] - - [ 8, "a", "bb", "bbb", "bbbb", "8.0", 1590738997000 ] - - [ 9, "b", "bb", "bbb", "bbbb", "9.0", 1590738998000 ] - - [10, "b", "bb", "bbb", "bbbb", "10.0", 1590738999000 ] + - [ 1, "a", "aa", "aaa", "aaaa", 1.0, 1590738990000] + - [ 2, "a", "aa", "aaa", "aaaa", 2.0, 1590738991000] + - [ 3, "a", "aa", "aaa", "bbbb", 3.0, 1590738992000] + - [ 4, "a", "aa", "aaa", "bbbb", 4.0, 1590738993000] + - [ 5, "a", "aa", "bbb", "bbbb", 5.0, 1590738994000] + - [ 6, "a", "aa", "bbb", "bbbb", 6.0, 1590738995000] + - [ 7, "a", "bb", "bbb", "bbbb", 7.0, 1590738996000 ] + - [ 8, "a", "bb", "bbb", "bbbb", 8.0, 1590738997000 ] + - [ 9, "b", "bb", "bbb", "bbbb", 9.0, 1590738998000 ] + - [10, "b", "bb", "bbb", "bbbb", 10.0, 1590738999000 ] - columns: ["rid int", "x1 string", "x2 string", "x3 string", "x4 string", "x6 double", "x7 timestamp"] indexs: ["index1:x1:x7", "index2:x2:x7", "index3:x3:x7", "index4:x4:x7", ] rows: - - [ 1, "a", "aa", "aaa", "aaaa", "1.0", 1590738990000 ] - - [ 2, "a", "aa", "aaa", "aaaa", "1.0", 1590738991000 ] - - [ 3, "a", "aa", "aaa", "bbbb", "1.0", 1590738992000 ] - - [ 4, "a", "aa", "aaa", "bbbb", "1.0", 1590738993000 ] - - [ 5, "a", "aa", "bbb", "bbbb", "1.0", 1590738994000 ] - - [ 6, "a", "aa", "bbb", "bbbb", "1.0", 1590738995000 ] - - [ 7, "a", "bb", "bbb", "bbbb", "1.0", 1590738996000 ] - - [ 8, "a", "bb", "bbb", "bbbb", "1.0", 1590738997000 ] - - [ 9, "b", "bb", "bbb", "bbbb", "1.0", 1590738998000 ] - - [ 10, "b", "bb", "bbb", "bbbb", "1.0",1590738999000 ] + - [ 1, "a", "aa", "aaa", "aaaa", 1.0, 1590738990000 ] + - [ 2, "a", 
"aa", "aaa", "aaaa", 1.0, 1590738991000 ] + - [ 3, "a", "aa", "aaa", "bbbb", 1.0, 1590738992000 ] + - [ 4, "a", "aa", "aaa", "bbbb", 1.0, 1590738993000 ] + - [ 5, "a", "aa", "bbb", "bbbb", 1.0, 1590738994000 ] + - [ 6, "a", "aa", "bbb", "bbbb", 1.0, 1590738995000 ] + - [ 7, "a", "bb", "bbb", "bbbb", 1.0, 1590738996000 ] + - [ 8, "a", "bb", "bbb", "bbbb", 1.0, 1590738997000 ] + - [ 9, "b", "bb", "bbb", "bbbb", 1.0, 1590738998000 ] + - [ 10, "b", "bb", "bbb", "bbbb", 1.0,1590738999000 ] sql: | select id, c1, c2, c3, c4, c6, c7, cur_hour, today , w1_sum_c6, w1_max_c6, w1_min_c6, w1_avg_c6, w1_cnt_c6 @@ -616,13 +616,13 @@ cases: "t1_rid int32", "t2_rid int32", "t3_rid int32", "t4_rid int32"] order: id rows: - - [ 1, "a", "aa", "aaa", "aaaa", "1.0", 1590738990000, 15, 29, 1.0, 1.0, 1.0, 1.0, 1, NULL, NULL, NULL, NULL] - - [ 2, "a", "aa", "aaa", "aaaa", "2.0", 1590738991000, 15, 29, 3.0, 2.0, 1.0, 1.5, 2, 1, NULL, NULL, NULL ] - - [ 3, "a", "aa", "aaa", "bbbb", "3.0", 1590738992000, 15, 29, 6.0, 3.0, 1.0, 2.0, 3, 2 , 1, NULL, NULL] - - [ 4, "a", "aa", "aaa", "bbbb", "4.0", 1590738993000, 15, 29, 10.0, 4.0, 1.0, 2.5, 4, 3 , 2, 1, NULL] - - [ 5, "a", "aa", "bbb", "bbbb", "5.0", 1590738994000, 15, 29, 15.0, 5.0, 1.0, 3.0, 5, 4 , 3, NULL, NULL] - - [ 6, "a", "aa", "bbb", "bbbb", "6.0", 1590738995000, 15, 29, 21.0, 6.0, 1.0, 3.5, 6, 5 , 4, NULL, NULL] - - [ 7, "a", "bb", "bbb", "bbbb", "7.0", 1590738996000, 15, 29, 28.0, 7.0, 1.0, 4.0, 7, 6 , NULL, NULL, 3] - - [ 8, "a", "bb", "bbb", "bbbb", "8.0", 1590738997000, 15, 29, 36.0, 8.0, 1.0, 4.5, 8, 7 , NULL, 5, 4] - - [ 9, "b", "bb", "bbb", "bbbb", "9.0", 1590738998000, 15, 29, 9.0, 9.0, 9.0, 9.0, 1, NULL , 7, 6, 5] - - [ 10, "b", "bb", "bbb", "bbbb", "10.0",1590738999000,15, 29, 19.0, 10.0, 9.0, 9.5, 2, 9, 8, 7, 6] + - [ 1, "a", "aa", "aaa", "aaaa", 1.0, 1590738990000, 15, 29, 1.0, 1.0, 1.0, 1.0, 1, NULL, NULL, NULL, NULL] + - [ 2, "a", "aa", "aaa", "aaaa", 2.0, 1590738991000, 15, 29, 3.0, 2.0, 1.0, 1.5, 2, 1, NULL, NULL, NULL ] + - [ 3, "a", "aa", "aaa", "bbbb", 3.0, 1590738992000, 15, 29, 6.0, 3.0, 1.0, 2.0, 3, 2 , 1, NULL, NULL] + - [ 4, "a", "aa", "aaa", "bbbb", 4.0, 1590738993000, 15, 29, 10.0, 4.0, 1.0, 2.5, 4, 3 , 2, 1, NULL] + - [ 5, "a", "aa", "bbb", "bbbb", 5.0, 1590738994000, 15, 29, 15.0, 5.0, 1.0, 3.0, 5, 4 , 3, NULL, NULL] + - [ 6, "a", "aa", "bbb", "bbbb", 6.0, 1590738995000, 15, 29, 21.0, 6.0, 1.0, 3.5, 6, 5 , 4, NULL, NULL] + - [ 7, "a", "bb", "bbb", "bbbb", 7.0, 1590738996000, 15, 29, 28.0, 7.0, 1.0, 4.0, 7, 6 , NULL, NULL, 3] + - [ 8, "a", "bb", "bbb", "bbbb", 8.0, 1590738997000, 15, 29, 36.0, 8.0, 1.0, 4.5, 8, 7 , NULL, 5, 4] + - [ 9, "b", "bb", "bbb", "bbbb", 9.0, 1590738998000, 15, 29, 9.0, 9.0, 9.0, 9.0, 1, NULL , 7, 6, 5] + - [ 10, "b", "bb", "bbb", "bbbb", 10.0,1590738999000,15, 29, 19.0, 10.0, 9.0, 9.5, 2, 9, 8, 7, 6] diff --git a/cases/integration_test/expression/test_type.yaml b/cases/integration_test/expression/test_type.yaml index 295ba024323..9def66de50a 100644 --- a/cases/integration_test/expression/test_type.yaml +++ b/cases/integration_test/expression/test_type.yaml @@ -83,10 +83,8 @@ cases: sql: select cast(c9 as bool) as b1,cast(c2 as smallint) as b2,cast(c3 as int) as b3,cast(c4 as bigint) as b4,cast(c5 as float) as b5,cast(c6 as double) as b6,cast(c7 as timestamp) as b7,cast(c8 as date) as b8,cast(c1 as string) as b9 from {0}; expect: columns: ["b1 bool","b2 smallint","b3 int","b4 bigint","b5 float","b6 double","b7 timestamp","b8 date","b9 string"] - expectProvider: - 0: - rows: - - 
[false,30,30,30,30.0,30.0,1590738989000,"2020-05-01",aa] + rows: + - [false,30,30,30,30.0,30.0,null,"2020-05-01",aa] - id: 3 desc: "cast_date_正确" # tags: ["TODO","本地成功,CICD失败,原因待定位"] diff --git a/cases/integration_test/function/test_udaf_function.yaml b/cases/integration_test/function/test_udaf_function.yaml index 13cf41d9e43..70e49da5905 100644 --- a/cases/integration_test/function/test_udaf_function.yaml +++ b/cases/integration_test/function/test_udaf_function.yaml @@ -2401,15 +2401,15 @@ cases: - columns: [ "id int","ts timestamp","group1 string","val1 int" ] indexs: [ "index1:group1:ts" ] name: t1 - data: | - 1, 1612130400000, g1, 1 - 2, 1612130401000, g1, 2 - 3, 1612130402000, g1, 3 - 4, 1612130403000, g1, 4 - 5, 1612130404000, g1, 5 - 6, 1612130404000, g2, 4 - 7, 1612130405000, g2, 3 - 8, 1612130406000, g2, 2 + rows: + - [1, 1612130400000, g1, 1] + - [2, 1612130401000, g1, 2] + - [3, 1612130402000, g1, 3] + - [4, 1612130403000, g1, 4] + - [5, 1612130404000, g1, 5] + - [6, 1612130404000, g2, 4] + - [7, 1612130405000, g2, 3] + - [8, 1612130406000, g2, 2] sql: | select `id`, @@ -2440,15 +2440,15 @@ cases: - columns: [ "id int","ts timestamp","group1 string","val1 int" ] indexs: [ "index1:group1:ts" ] name: t1 - data: | - 1, 1612130400000, g1, 1 - 2, 1612130401000, g1, 2 - 3, 1612130402000, g1, 3 - 4, 1612130403000, g1, 4 - 5, 1612130404000, g1, 5 - 6, 1612130405000, g2, 4 - 7, 1612130406000, g2, 3 - 8, 1612130407000, g2, 2 + rows: + - [1, 1612130400000, g1, 1] + - [2, 1612130401000, g1, 2] + - [3, 1612130402000, g1, 3] + - [4, 1612130403000, g1, 4] + - [5, 1612130404000, g1, 5] + - [6, 1612130405000, g2, 4] + - [7, 1612130406000, g2, 3] + - [8, 1612130407000, g2, 2] sql: | select `id`, @@ -2480,15 +2480,15 @@ cases: - columns: [ "id int","ts timestamp","group1 string","val1 int" ] indexs: [ "index1:group1:ts" ] name: t1 - data: | - 1, 1612130400000, g1, 1 - 2, 1612130401000, g1, 2 - 3, 1612130402000, g1, 3 - 4, 1612130403000, g1, 4 - 5, 1612130404000, g1, 5 - 6, 1612130405000, g2, 4 - 7, 1612130406000, g2, 3 - 8, 1612130407000, g2, 2 + rows: + - [1, 1612130400000, g1, 1] + - [2, 1612130401000, g1, 2] + - [3, 1612130402000, g1, 3] + - [4, 1612130403000, g1, 4] + - [5, 1612130404000, g1, 5] + - [6, 1612130405000, g2, 4] + - [7, 1612130406000, g2, 3] + - [8, 1612130407000, g2, 2] sql: | select `id`, @@ -2519,15 +2519,15 @@ cases: - columns: [ "id int","ts timestamp","group1 string","val1 int" ] indexs: [ "index1:group1:ts" ] name: t1 - data: | - 1, 1612130400000, g1, 1 - 2, 1612130401000, g1, 2 - 3, 1612130402000, g1, 3 - 4, 1612130403000, g1, 4 - 5, 1612130404000, g1, 5 - 6, 1612130405000, g2, 4 - 7, 1612130406000, g2, 3 - 8, 1612130407000, g2, 2 + rows: + - [1, 1612130400000, g1, 1] + - [2, 1612130401000, g1, 2] + - [3, 1612130402000, g1, 3] + - [4, 1612130403000, g1, 4] + - [5, 1612130404000, g1, 5] + - [ 6, 1612130405000, g2, 4] + - [7, 1612130406000, g2, 3] + - [8, 1612130407000, g2, 2] sql: | select `id`, diff --git a/cases/integration_test/join/test_lastjoin_simple.yaml b/cases/integration_test/join/test_lastjoin_simple.yaml index fa58a927859..9795d4a0d70 100644 --- a/cases/integration_test/join/test_lastjoin_simple.yaml +++ b/cases/integration_test/join/test_lastjoin_simple.yaml @@ -123,7 +123,10 @@ cases: - [ "cc",41,121,1590738991000 ] sql: select {0}.c1,{0}.c2,{1}.c3,{1}.c4 from {0} last join {1} on {0}.c1={1}.c1; expect: - success: false + columns: [ "c1 string","c2 int","c3 bigint","c4 timestamp" ] + rows: + - [ "aa",2,13,1590738989000 ] + - [ "bb",21,131,1590738990000 ] - id: 
4-2 desc: Last Join 无order by, 部分拼表条件命中索引 inputs: @@ -172,7 +175,12 @@ cases: - [ "cc",41,121,1590738991000 ] sql: select {0}.c1,{0}.c2,{1}.c3,{1}.c4 from {0} last join {1} on {0}.c1={1}.c1 and {0}.c4={1}.c4; expect: - success: false + columns: [ "c1 string","c2 int","c3 bigint","c4 timestamp" ] + rows: + - [ "aa",2,13,1590738989000 ] + - [ "aa",20,15,1590738991000 ] + - [ "bb",21,131,1590738990000 ] + - [ "dd",41,null,null ] - id: 4-4 desc: Last Join 无order by, 拼表条件命中部分的组合索引(后缀索引) mode: non-performance-sensitive-unsupport, offline-unsupport @@ -194,7 +202,12 @@ cases: - [ "cc",41,121,1590738991000 ] sql: select {0}.c1,{0}.c2,{1}.c3,{1}.c4 from {0} last join {1} on {0}.c1={1}.c1 and {0}.c4={1}.c4; expect: - success: false + columns: [ "c1 string","c2 int","c3 bigint","c4 timestamp" ] + rows: + - [ "aa",2,13,1590738989000 ] + - [ "aa",20,15,1590738991000 ] + - [ "bb",21,131,1590738990000 ] + - [ "dd",41,null,null ] - id: 4-5 desc: Last Join 无order by, 拼表条件命中索引, 副表多条命中 tags: [ "注意offline随机拼接最后一条,改变结果顺序可能导致Spark结果不符合预期" ] diff --git a/cases/integration_test/long_window/test_long_window_batch.yaml b/cases/integration_test/long_window/test_long_window_batch.yaml index 60c938490d4..497640b318f 100644 --- a/cases/integration_test/long_window/test_long_window_batch.yaml +++ b/cases/integration_test/long_window/test_long_window_batch.yaml @@ -19,6 +19,7 @@ cases: - id: 0 desc: options格式错误 + mode: apiserver-unsupport inputs: - columns : ["id int","c1 string","c3 int","c4 bigint","c5 float","c6 double","c7 timestamp","c8 date"] diff --git a/cases/integration_test/window/error_window.yaml b/cases/integration_test/window/error_window.yaml index ce2fc32983f..798c66673d2 100644 --- a/cases/integration_test/window/error_window.yaml +++ b/cases/integration_test/window/error_window.yaml @@ -26,7 +26,7 @@ cases: sql: | SELECT id, c1, c4, count(c4) OVER w1 as w1_c4_count FROM {0} WINDOW w1 AS (PARTITION BY {0}.c8 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW); expect: - success: false + success: true - id: 1 desc: no partition by inputs: diff --git a/cases/plan/alter.yaml b/cases/plan/alter.yaml new file mode 100644 index 00000000000..f954577506d --- /dev/null +++ b/cases/plan/alter.yaml @@ -0,0 +1,44 @@ +# Copyright 2021 4Paradigm +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cases: + - id: alter_user + desc: alter user + sql: | + alter user root set options(password="123456"); + expect: + node_tree_str: | + +-node[kAlterUserStmt] + +-if_exists: false + +-user: root + +-options: + +-password: + +-expr[primary] + +-value: 123456 + +-type: string + + - id: alter_user_if_exist + desc: alter user + sql: | + alter user if exists root set options(password="123456"); + expect: + node_tree_str: | + +-node[kAlterUserStmt] + +-if_exists: true + +-user: root + +-options: + +-password: + +-expr[primary] + +-value: 123456 + +-type: string diff --git a/cases/plan/back_quote_identifier.yaml b/cases/plan/back_quote_identifier.yaml index cafce9e5b2d..4743634c370 100644 --- a/cases/plan/back_quote_identifier.yaml +++ b/cases/plan/back_quote_identifier.yaml @@ -131,12 +131,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: a-1 | | +-column_type: int32 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | +-column_name: b-1 | | +-column_type: string - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | +-keys: [a-1, b-1] diff --git a/cases/plan/cmd.yaml b/cases/plan/cmd.yaml index 58eb872268f..bcffc51507c 100644 --- a/cases/plan/cmd.yaml +++ b/cases/plan/cmd.yaml @@ -189,6 +189,23 @@ cases: +-cmd_type: drop database +-if_exists: true +-args: [db1] + - id: drop_user + desc: DROP USER IF EXISTS + sql: DROP USER IF EXISTS user1 + expect: + node_tree_str: | + +-node[CMD] + +-cmd_type: drop user + +-if_exists: true + +-args: [user1] + - id: drop_user_1 + desc: DROP USER + sql: DROP USER user1 + expect: + node_tree_str: | + +-node[CMD] + +-cmd_type: drop user + +-args: [user1] - id: show_deployments desc: show deployments sql: SHOW DEPLOYMENTS; diff --git a/cases/plan/const_query.yaml b/cases/plan/const_query.yaml index 98ce9ff2119..c388f163b7c 100644 --- a/cases/plan/const_query.yaml +++ b/cases/plan/const_query.yaml @@ -39,3 +39,36 @@ cases: mode: request-unsupport sql: | select int(NULL) as c1, bigint(NULL) as c2, float(NULL) as c3, double(NULL) as c4, timestamp(NULL) as c5, date(NULL) as c6, string(NULL) as c7; + + - id: map_data_type + mode: request-unsupport + desc: access map value with []operator + sql: | + select map(1, 2)[1] + expect: + node_tree_str: | + +-node[kQuery]: kQuerySelect + +-distinct_opt: false + +-where_expr: null + +-group_expr_list: null + +-having_expr: null + +-order_expr_list: null + +-limit: null + +-select_list[list]: + | +-0: + | +-node[kResTarget] + | +-val: + | | map(1, 2)[1] + | +-name: + +-tableref_list: [] + +-window_list: [] + plan_tree_str: | + +-[kQueryPlan] + +-[kProjectPlan] + +-table: + +-project_list_vec[list]: + +-[kProjectList] + +-projects on table [list]: + +-[kProjectNode] + +-[0]map(1, 2)[1]: map(1, 2)[1] + null diff --git a/cases/plan/create.yaml b/cases/plan/create.yaml index 66bb1ee548c..00c7e583406 100644 --- a/cases/plan/create.yaml +++ b/cases/plan/create.yaml @@ -163,12 +163,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: a | | +-column_type: int32 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | +-column_name: b | | +-column_type: string - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | +-keys: [a, b] @@ -218,12 +216,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: a | | +-column_type: int16 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | +-column_name: b | | +-column_type: float - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | +-keys: [a] @@ -274,12 +270,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: a | | +-column_type: int32 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | 
+-column_name: b | | +-column_type: timestamp - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | +-keys: [a] @@ -627,12 +621,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: a | | +-column_type: int32 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | +-column_name: b | | +-column_type: timestamp - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | +-keys: [a] @@ -685,33 +677,27 @@ cases: | +-0: | | +-node[kColumnDesc] | | +-column_name: column1 - | | +-column_type: int32 - | | +-NOT NULL: 1 + | | +-column_type: int32 NOT NULL | +-1: | | +-node[kColumnDesc] | | +-column_name: column2 - | | +-column_type: int16 - | | +-NOT NULL: 1 + | | +-column_type: int16 NOT NULL | +-2: | | +-node[kColumnDesc] | | +-column_name: column5 - | | +-column_type: string - | | +-NOT NULL: 1 + | | +-column_type: string NOT NULL | +-3: | | +-node[kColumnDesc] | | +-column_name: column6 - | | +-column_type: string - | | +-NOT NULL: 1 + | | +-column_type: string NOT NULL | +-4: | | +-node[kColumnDesc] | | +-column_name: std_ts - | | +-column_type: timestamp - | | +-NOT NULL: 1 + | | +-column_type: timestamp NOT NULL | +-5: | | +-node[kColumnDesc] | | +-column_name: std_date - | | +-column_type: date - | | +-NOT NULL: 1 + | | +-column_type: date NOT NULL | +-6: | +-node[kColumnIndex] | +-keys: [column2] @@ -743,33 +729,27 @@ cases: | +-0: | | +-node[kColumnDesc] | | +-column_name: column1 - | | +-column_type: int32 - | | +-NOT NULL: 1 + | | +-column_type: int32 NOT NULL | +-1: | | +-node[kColumnDesc] | | +-column_name: column2 - | | +-column_type: int16 - | | +-NOT NULL: 1 + | | +-column_type: int16 NOT NULL | +-2: | | +-node[kColumnDesc] | | +-column_name: column5 - | | +-column_type: string - | | +-NOT NULL: 1 + | | +-column_type: string NOT NULL | +-3: | | +-node[kColumnDesc] | | +-column_name: column6 - | | +-column_type: string - | | +-NOT NULL: 1 + | | +-column_type: string NOT NULL | +-4: | | +-node[kColumnDesc] | | +-column_name: std_ts - | | +-column_type: timestamp - | | +-NOT NULL: 1 + | | +-column_type: timestamp NOT NULL | +-5: | | +-node[kColumnDesc] | | +-column_name: std_date - | | +-column_type: date - | | +-NOT NULL: 1 + | | +-column_type: date NOT NULL | +-6: | +-node[kColumnIndex] | +-keys: [column2] @@ -796,17 +776,11 @@ cases: | +-0: | | +-node[kColumnDesc] | | +-column_name: column1 - | | +-column_type: int32 - | | +-NOT NULL: 0 - | | +-default_value: - | | +-expr[primary] - | | +-value: 1 - | | +-type: int32 + | | +-column_type: int32 DEFAULT 1 | +-1: | +-node[kColumnDesc] | +-column_name: column2 | +-column_type: int32 - | +-NOT NULL: 0 +-table_option_list: [] - id: 27 desc: Column default value with explicit type @@ -824,20 +798,11 @@ cases: | +-0: | | +-node[kColumnDesc] | | +-column_name: column1 - | | +-column_type: string - | | +-NOT NULL: 0 - | | +-default_value: - | | +-expr[cast] - | | +-cast_type: string - | | +-expr: - | | +-expr[primary] - | | +-value: 1 - | | +-type: int32 + | | +-column_type: string DEFAULT string(1) | +-1: | +-node[kColumnDesc] | +-column_name: column3 | +-column_type: int32 - | +-NOT NULL: 0 +-table_option_list: [] - id: 28 desc: Create table with database.table @@ -856,12 +821,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: column1 | | +-column_type: string - | | +-NOT NULL: 0 | +-1: | +-node[kColumnDesc] | +-column_name: column3 | +-column_type: int32 - | +-NOT NULL: 0 +-table_option_list: [] - id: 29 desc: create index with db name prefix @@ -898,12 +861,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: column1 | | +-column_type: 
int32 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | +-column_name: column2 | | +-column_type: timestamp - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | +-keys: [column1] @@ -934,12 +895,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: a | | +-column_type: int32 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | +-column_name: b | | +-column_type: timestamp - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | +-keys: [a] @@ -1049,12 +1008,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: column1 | | +-column_type: int32 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | +-column_name: column2 | | +-column_type: timestamp - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | +-keys: [column1] @@ -1068,3 +1025,83 @@ cases: +-0: +-node[kCompressType] +-compress_type: snappy + + - id: 35 + desc: Create table with array & map type + sql: | + create table t1 (id int, + member ARRAY NOT NULL, + attrs MAP NOT NULL); + expect: + node_tree_str: | + +-node[CREATE] + +-table: t1 + +-IF NOT EXIST: 0 + +-column_desc_list[list]: + | +-0: + | | +-node[kColumnDesc] + | | +-column_name: id + | | +-column_type: int32 + | +-1: + | | +-node[kColumnDesc] + | | +-column_name: member + | | +-column_type: array NOT NULL + | +-2: + | +-node[kColumnDesc] + | +-column_name: attrs + | +-column_type: map NOT NULL + +-table_option_list: [] + plan_tree_str: | + +-[kCreatePlan] + +-table: t1 + +-column_desc_list[list]: + | +-0: + | | +-node[kColumnDesc] + | | +-column_name: id + | | +-column_type: int32 + | +-1: + | | +-node[kColumnDesc] + | | +-column_name: member + | | +-column_type: array NOT NULL + | +-2: + | +-node[kColumnDesc] + | +-column_name: attrs + | +-column_type: map NOT NULL + +-table_option_list: [] + + - id: 36 + desc: create user + sql: | + create user root; + expect: + node_tree_str: | + +-node[kCreateUserStmt] + +-if_not_exists: false + +-user: root + +-options: + + - id: 37 + desc: create user if not exist + sql: | + create user if not exists root; + expect: + node_tree_str: | + +-node[kCreateUserStmt] + +-if_not_exists: true + +-user: root + +-options: + + - id: create_user_passwd + desc: create user with password + sql: | + create user root OPTIONS (password="123456"); + expect: + node_tree_str: | + +-node[kCreateUserStmt] + +-if_not_exists: false + +-user: root + +-options: + +-password: + +-expr[primary] + +-value: 123456 + +-type: string diff --git a/cases/plan/simple_query.yaml b/cases/plan/simple_query.yaml index 66cc542fbc0..95372e7803f 100644 --- a/cases/plan/simple_query.yaml +++ b/cases/plan/simple_query.yaml @@ -644,3 +644,4 @@ cases: +-[kTablePlan] +-table: t +-alias: t1 + diff --git a/cases/query/udf_query.yaml b/cases/query/udf_query.yaml index ded80e003ce..c2fdc4678de 100644 --- a/cases/query/udf_query.yaml +++ b/cases/query/udf_query.yaml @@ -554,3 +554,37 @@ cases: - c1 bool data: | true, false + + # ================================================================ + # Map data type + # ================================================================ + - id: 13 + mode: request-unsupport + sql: | + select + map(1, "2")[1] as e1, + map("abc", 100)["abc"] as e2, + map(1, "2", 3, "4")[5] as e3, + map("c", 99, "d", 101)["d"] as e4, + map(date("2012-12-12"), "e", date("2013-11-11"), "f", date("2014-10-10"), "g")[date("2013-11-11")] as e5, + map(timestamp(88), timestamp(1000), timestamp(99), timestamp(2000)) [timestamp(99)] as e6, + map('1', 2, '3', 4, '5', 6, '7', 8, '9', 10, '11', 12)['9'] as e7, + map('1', 2, '3', 4, '5', 6, '7', 8, '9', 10, '11', 12)['10'] as 
e8, + # first match on duplicate keys + map('1', 2, '1', 4, '1', 6, '7', 8, '9', 10, '11', 12)['1'] as e9, + map("c", 99, "d", NULL)["d"] as e10, + expect: + columns: ["e1 string", "e2 int", "e3 string", "e4 int", "e5 string", "e6 timestamp", "e7 int", "e8 int", "e9 int", "e10 int"] + data: | + 2, 100, NULL, 101, f, 2000, 10, NULL, 2, NULL + - id: 14 + mode: request-unsupport + sql: | + select + array_contains(map_keys(map(1, '2', 3, '4')), 1) as e1, + array_contains(map_keys(map('1', 2, '3', 4)), '2') as e2, + array_contains(map_keys(map(timestamp(88), timestamp(1000), timestamp(99), timestamp(2000))) , timestamp(99)) as e3, + expect: + columns: ["e1 bool", "e2 bool", "e3 bool"] + data: | + true, false, true diff --git a/demo/usability_testing/data_mocker.py b/demo/usability_testing/data_mocker.py index f873daec9dc..6729ef0b70b 100644 --- a/demo/usability_testing/data_mocker.py +++ b/demo/usability_testing/data_mocker.py @@ -6,6 +6,7 @@ from typing import Optional import numpy as np import pandas as pd +import dateutil # to support save csv, and faster parquet, we don't use faker-cli directly @@ -146,8 +147,9 @@ def type_converter(sql_type): if sql_type in ['varchar', 'string']: # TODO(hw): set max length return 'pystr', {} + # timestamp should > 0 cuz tablet insert will check it, use utc if sql_type in ['date', 'timestamp']: - return 'iso8601', {} + return 'iso8601', {"tzinfo": dateutil.tz.UTC} if sql_type in ['float', 'double']: return 'pyfloat', ranges[sql_type] return 'py' + sql_type, {} diff --git a/docker/Dockerfile b/docker/Dockerfile index 9faef4db550..f60763b1918 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -15,8 +15,8 @@ FROM centos:7 -ARG ZETASQL_VERSION=0.3.1 -ARG THIRDPARTY_VERSION=0.6.0 +ARG ZETASQL_VERSION=0.3.3 +ARG THIRDPARTY_VERSION=0.6.1 ARG TARGETARCH LABEL org.opencontainers.image.source https://github.com/4paradigm/OpenMLDB diff --git a/docs/en/about/OpenMLDB_intro_2024.pdf b/docs/en/about/OpenMLDB_intro_2024.pdf new file mode 100644 index 00000000000..294828823ad Binary files /dev/null and b/docs/en/about/OpenMLDB_intro_2024.pdf differ diff --git a/docs/en/about/index.rst b/docs/en/about/index.rst index 6f8d3ae80d1..90d326d0534 100644 --- a/docs/en/about/index.rst +++ b/docs/en/about/index.rst @@ -7,6 +7,6 @@ About intro community - Milestones - Change Logs - + milestones + change_logs + Introduction Slides diff --git a/docs/en/deploy/install_deploy.md b/docs/en/deploy/install_deploy.md index 5c94f439ca5..3133c6a72ab 100644 --- a/docs/en/deploy/install_deploy.md +++ b/docs/en/deploy/install_deploy.md @@ -560,6 +560,8 @@ cp conf/apiserver.flags.template conf/apiserver.flags * Modify the `endpoint`. The `endpoint` consists of a colon-separated deployment machine IP/domain name and port number (endpoints cannot use 0.0.0.0 and 127.0.0.1, and must be a public IP). * Modify `zk_cluster` to point to the address of the ZooKeeper service that has already been started (see [Deploy ZooKeeper - 4. ZooKeeper Service Address and Connection Test](zookeeper_addr)). If the ZooKeeper service is a cluster, separate the addresses with commas, for example, `172.27.128.33:7181,172.27.128.32:7181,172.27.128.31:7181`. * Modify `zk_root_path`. In this example, `/openmldb_cluster` is used. Note that **components under the same cluster share the same `zk_root_path`**. So in this deployment, the `zk_root_path` for each component's configuration is `/openmldb_cluster`. +* You can specify the username and password to connect to the server using `--user` and `--password`. 
+* By default, it connects to the server using the root user and an empty password. If you've changed the root password, you need to specify the new password using `--password`. ``` --endpoint=172.27.128.33:8080 @@ -636,6 +638,7 @@ cp conf/taskmanager.properties.template conf/taskmanager.properties * Modify `offline.data.prefix`: Set it to the storage path for offline tables. In Yarn mode, modify it to the corresponding HDFS path. * Modify `spark.master`: Set it according to the desired mode. Currently supports local and yarn modes for running offline tasks. * Modify `spark.home`: Set it to the Spark environment path. If not configured, the `SPARK_HOME` environment variable will be used. It should be the directory where the spark-optimized package was extracted in the first step, and it must be an absolute path. +* You can specify the username and password to connect to the server using `user` and `password`. If you've changed the root password, you'll need to specify the new password for the root user. ``` server.host=172.27.128.33 diff --git a/docs/en/developer/built_in_function_develop_guide.md b/docs/en/developer/built_in_function_develop_guide.md index 97d00076f87..3041a789267 100644 --- a/docs/en/developer/built_in_function_develop_guide.md +++ b/docs/en/developer/built_in_function_develop_guide.md @@ -6,17 +6,20 @@ OpenMLDB contains hundreds of built-in functions that help data scientists extra OpenMLDB classifies functions as aggregate or scalar depending on the input data values and result values. -- An *aggregate function* receives **a set of** values for each argument (such as the values of a column) and returns a single-value result for the set of input values. - - A *scalar function* receives **a single value** for each argument and returns a single value result. A scalar function can be classified into several groups: - -- - Mathematical function + - Mathematical function - Logical function - Date & Time function - String function - Conversion function -This article is a hands-on guide for built-in scalar function development in OpenMLDB. We will not dive into aggregate function development in detail. We truly welcome developers who want to join our community and help extend our functions. +- An *aggregate function* receives **a set of** values for each argument (such as the values of a column) and returns a single-value result for the set of input values. + +This article serves as an introductory guide to developing SQL built-in functions, aiming to guide developers in quickly grasping the basic methods of developing custom functions. + +First, we will provide a detailed overview of the development steps, classification, and examples of scalar function development. This will enable developers to understand the basic development and registration patterns of custom functions. + +Subsequently, we will transition to the details of developing complex aggregate functions. We sincerely welcome more developers to join our community and assist us in expanding and developing the built-in function collection. ## 2. Develop a Built-In SQL Function @@ -34,13 +37,39 @@ Developers need to **take care of the following** rules when developing a functi #### 2.1.1 Code Location -Developers can declare function in [hybridse/src/udf/udf.h](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/udf.h) and implement it in [hybridse/src/udf/udf.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/udf.cc) within namespace `hybridse::udf::v1`. 
+Developers can declare function in [hybridse/src/udf/udf.h](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/udf.h) and implement it in [hybridse/src/udf/udf.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/udf.cc). +If the function is complex, developers can declare and implement in separate `.h` and `.cc` files in [hybridse/src/udf/](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/). + +The functions are usually within namespace `hybridse::udf::v1`. + +- ```c++ + # hybridse/src/udf/udf.h + namespace hybridse { + namespace udf { + namespace v1 { + // declare built-in function + } // namespace v1 + } // namespace udf + } // namespace hybridse + ``` + +- ```c++ + # hybridse/src/udf/udf.cc + namespace hybridse { + namespace udf { + namespace v1 { + // implement built-in function + } // namespace v1 + } // namespace udf + } // namespace hybridse + ``` #### 2.1.2 C++ Function Naming Rules - Function names are all lowercase, with underscores between words. Check [snake_case](https://en.wikipedia.org/wiki/Snake_case) for more details. - Function names should be clear and readable. Use names that describe the purpose or intent of the function. +(c_vs_sql)= #### 2.1.3 C++ and SQL Data Type C++ built-in functions can use limited data types, including BOOL, Numeric, String, Timestamp and Date. The correspondence between the SQL data type and the C++ data type is shown as follows: @@ -64,7 +93,7 @@ C++ built-in functions can use limited data types, including BOOL, Numeric, Stri - SQL function parameters and C++ function parameters have the same position order. -- C++ function parameter types should match the SQL types. Check [2.1.3 C++ and SQL Data Type](#2.1.3-C++ and SQL Data Type) for more details. +- C++ function parameter types should match the SQL types. Check [2.1.3 C++ and SQL Data Type](c_vs_sql) for more details. - SQL function return type: @@ -89,12 +118,17 @@ C++ built-in functions can use limited data types, including BOOL, Numeric, Stri void func_output_nullable_date(int64_t, codec::Date*, bool*); ``` - - Notice that return types have greater impact on built-in function developing behaviours. We will cover the details in a later section [3. Built-in Function Development Template](#3.-Built-in Function Development Template). + - Notice that return types have greater impact on built-in function developing behaviours. We will cover the details in a later section [3.2 Scalar Function Development Classification](sfunc_category). + +- Handling Nullable Parameters: + - Generally, OpenMLDB adopts a uniform approach to handling NULL parameters for all built-in scalar functions. That is, if any input parameter is NULL, the function will directly return NULL. + - However, for scalar functions or aggregate functions that require special handling of NULL parameters, you can configure the parameter as `Nullable`. In the C++ function, you will then use the corresponding C++ type of ArgType and `bool*` to express this parameter. For more details, refer to [3.2.4 Nullable SQL Function Parameters](arg_nullable). #### 2.1.5 Memory Management - Operator `new` operator or method `malloc` are forbidden in C++ built-in function implementation. -- Developers must call provided memory management APIs in order to archive space allocation for output parameters: +- In C++ built-in aggregate functions, it is permissible to use the `new` or `malloc` functions to allocate memory during initialization. 
However, it is crucial to ensure that the allocated space is released when the `output` generates the final result. +- Developers must call provided memory management APIs in order to allocate space for UDF output parameters: - `hybridse::udf::v1::AllocManagedStringBuf(size)` to allocate space. OpenMLDB `ByteMemoryPool` will assign continous space to the function and will release it when safe. - If allocated size < 0, allocation will fail. `AllocManagedStringBuf` return null pointer. - If allocated size exceed the MAX_ALLOC_SIZE which is 2048, the allocation will fail. `AllocManagedStringBuf` return null pointer. @@ -147,33 +181,16 @@ OpenMLDB `DefaultUdfLibrary` stores and manages the global built-in SQL functio - The SQL function name does not have to be the same as the C++ function name, since the SQL function name will be linked to the C++ function via the registry. - SQL function names are case-insensitive. For instance, given register name "aaa_bb", the users can access it by calling `AAA_BB()`, `Aaa_Bb()`, `aAa_bb()` in SQL. -#### 2.2.3 Register and Configure Function - -`DefaultUdfLibrary::RegisterExternal` create an instance of `ExternalFuncRegistryHelper` with a name. The name will be the function's registered name. +#### 2.2.3 Register Function Interface -```c++ -ExternalFuncRegistryHelper helper = RegisterExternal("register_func_name"); -// ... ignore function configuration details -``` +- Registering scalar functions: + - For a scalar function with a single input type: `RegisterExternal("register_func_name")` + - For a generic function that supports multiple types: `RegisterExternalTemplate("register_func_name")` +- Registering aggregate functions: + - For an aggregate function with a single input type: `RegisterUdaf("register_func_name")` + - For a generic function that supports multiple types: `RegisterUdafTemplate("register_func_name")` - `ExternalFuncRegistryHelper` provides a set of APIs to help developers to configure the functions and register it into the *default library*. - -```c++ -RegisterExternal("register_func_name") - .args(built_in_fn_pointer) - .return_by_arg(bool_value) - .returns - .doc(documentation) -``` - -- `args`: Configure argument types. -- `built_in_fn_pointer`: Built-in function pointer. -- `returns`: Configure return type. Notice that when function result is Nullable, we should configure ***return type*** as ***returns>*** explicitly. -- `return_by_arg()` : Configure whether return value will be store in parameters or not. - - When **return_by_arg(false)** , result will be return directly. OpenMLDB configure `return_by_arg(false) ` by default. - - When **return_by_arg(true)**, the result will be stored and returned by parameters. - - if the return type is ***non-nullable***, the result will be stored and returned via the last parameter. - - if the return type is **nullable**, the ***result value*** will be stored in the second-to-last parameter and the ***null flag*** will be stored in the last parameter. if ***null flag*** is true, function result is **null**, otherwise, function result is obtained from second-to-last parameter. +The specific interface definitions are described in detail in the following sections. #### 2.2.4 Documenting Function @@ -187,12 +204,12 @@ Function docstrings should contain the following information: - **@since** command to specify the production version when the function was added to OpenMLDB.
The version can be obtained from the project's [CMakeList.txt](https://github.com/4paradigm/OpenMLDB/blob/main/CMakeLists.txt): ` ${OPENMLDB_VERSION_MAJOR}.${OPENMLDB_VERSION_MINOR}.${OPENMLDB_VERSION_BUG}` ```c++ -RegisterExternal("register_func_name") +RegisterExternal("function_name") //... .doc(R"( - @brief a brief summary of the my_function's purpose and behavior + @brief a brief summary of the my_function's purpose and behavior - @param param1 a brief description of param1 + @param param1 a brief description of param1 Example: @@ -203,15 +220,117 @@ RegisterExternal("register_func_name") @since 0.4.0)"); ``` -#### 2.2.5 RegisterAlias +#### 2.2.5 Register Alias Sometimes, we don't have to implement and register a function when it is an alias to another function that already exists in the default library. We can simply use api `RegisterAlias("alias_func", "original_func")` to link the current register function name with an existing registered name. ```c++ +// substring() is registered into default library already +RegisterAlias("substr", "substring"); +``` + +## 2.3 Function Unit Test + +Once a function is registered/developed, the developer should add some related unit tests to make sure everything is going well. + +#### 2.3.1 Add Unit Tests + +Generally, developers can test scalar functions with [src/codegen/udf_ir_builder_test.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/codegen/udf_ir_builder_test.cc), and test aggregate functions by adding `TEST_F` cases to [hybridse/src/udf/udaf_test.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/udaf_test.cc). OpenMLDB provides `CheckUdf` so that the developer can perform function checking easily. + +```c++ +CheckUdf("function_name", expect_result, arg_value,...); +``` + +For each function signature, we at least have to: + +- Add a unit test with a normal result +- If a parameter is ***nullable***, add a unit test with NULL input to produce a normal result +- Add a unit test with a null result if the result is **nullable** + +**Example**: +- Add unit test in [hybridse/src/codegen/udf_ir_builder_test.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/codegen/udf_ir_builder_test.cc): + ```c++ + // month(timestamp) normal check + TEST_F(UdfIRBuilderTest, month_timestamp_udf_test) { + Timestamp time(1589958000000L); + CheckUdf<int32_t, Timestamp>("month", 5, time); + } + + // date(timestamp) normal check + TEST_F(UdfIRBuilderTest, timestamp_to_date_test_0) { + CheckUdf<Nullable<Date>, Nullable<Timestamp>>( + "date", Date(2020, 05, 20), Timestamp(1589958000000L)); + } + // date(timestamp) null check + TEST_F(UdfIRBuilderTest, timestamp_to_date_test_null_0) { + CheckUdf<Nullable<Date>, Nullable<Timestamp>>("date", nullptr, nullptr); + } + ``` + +- Add unit test in [hybridse/src/udf/udaf_test.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/udaf_test.cc): + ```c++ + // avg udaf test + TEST_F(UdafTest, avg_test) { + CheckUdf<double, ListRef<int32_t>>("avg", 2.5, MakeList({1, 2, 3, 4})); + } + ``` + +(compile_ut)= +#### 2.3.2 Compile and Test + +- Compile `udf_ir_builder_test` and test + ```bash + # Compile udf_ir_builder_test, default output path is build/hybridse/src/codegen/udf_ir_builder_test + make OPENMLDB_BUILD_TARGET=udf_ir_builder_test TESTING_ENABLE=ON + + # Run test, note that environment variable SQL_CASE_BASE_DIR needs to be specified as OpenMLDB project path + SQL_CASE_BASE_DIR=${OPENMLDB_DIR} ./build/hybridse/src/codegen/udf_ir_builder_test + ``` +- Compile `udaf_test` and test + ```bash + # Compile udaf_test, default output path is
build/hybridse/src/udf/udaf_test + make OPENMLDB_BUILD_TARGET=udaf_test TESTING_ENABLE=ON + + # Run test, note that environment variable SQL_CASE_BASE_DIR need to be specified as OpenMLDB project path + SQL_CASE_BASE_DIR=${OPENMLDB_DIR} ./build/hybridse/src/udf/udaf_test + ``` + +If testing is to be done through SDK or command line, `OpenMLDB` needs to be recompiled. For compilation, refer to [compile.md](../deploy/compile.md). + +## 3. Scalar Function Development +### 3.1 Registration and Interface Configuration + +#### 3.1.1 Registration of Scalar Function Supporting Single Data Type + +The `DefaultUdfLibrary` provides the `RegisterExternal` interface to facilitate the registration of built-in scalar functions and initialize the registration name of the function. This method requires specifying a data type and only supports declared data types. + + +```c++ +RegisterExternal("register_func_name") + .args(static_cast(v1::func_ptr)) + .return_by_arg(bool_value) + .returns +``` + +The configuration of a function generally includes: function pointer configuration, parameter type configuration, and return value configuration. + +- Configuring the C++ function pointer: `func_ptr`. It is important to use static_cast to convert the pointer to a function pointer, considering code readability and compile-time safety. +- Configuring parameter types: `args`. +- Configuring return value type: `returns`. Typically, it is not necessary to explicitly specify the return type. However, if the function result is nullable, you need to explicitly configure the ***return type*** as ***returns>***. +- Configuring the return method: `return_by_arg()`. + - When **return_by_arg(false)**, the result is directly returned through the `return` statement. OpenMLDB defaults to `return_by_arg(false)`. + - When **return_by_arg(true)**, the result is returned through parameters: + - If the return type is ***non-nullable***, the function result is returned through the last parameter. + - If the return type is ***nullable***, the function result value is returned through the second-to-last parameter, and the ***null flag*** is returned through the last parameter. If the ***null flag*** is ***true***, the function result is ***null***; otherwise, the function result is retrieved from the second-to-last parameter. + +The following code demonstrates an example of registering the built-in single-row function `substring`. You can find the code in [hybridse/src/udf/default_udf_library.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/default_udf_library.cc). +```c++ +// void sub_string(StringRef *str, int32_t from, StringRef *output); + RegisterExternal("substring") .args( - static_cast(udf::v1::sub_string)) + static_cast(udf::v1::sub_string)) .return_by_arg(true) .doc(R"( @brief Return a substring `len` characters long from string str, starting at position `pos`. @@ -235,81 +354,62 @@ RegisterExternal("substring") @param len length of substring. If len is less than 1, the result is the empty string. @since 0.1.0)"); - -// substring() is registered into default library already -RegisterAlias("substr", "substring"); ``` -## 2.3 Function Unit Test +#### 3.1.2 Registration of Built-In Functions Supporting Generic Templates +We also provide the `RegisterExternalTemplate` interface to support the registration of generic built-in single-row functions, allowing simultaneous support for multiple data types. 
-Once a function is registered/developed, the developer should add some related unit tests to make sure everything is going well. - -#### 2.3.1 Add Unit Tests +```c++ +RegisterExternalTemplate("register_func_name") + .args_in() + .return_by_arg(bool_value) +``` -Generally, developers can add `TEST_F` cases to [src/codegen/udf_ir_builder_test.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/codegen/udf_ir_builder_test.cc). +The configuration of a function generally includes: function template configuration, supported parameter types configuration, and return method configuration. -OpenMLDB provides `CheckUdf` in [src/codegen/udf_ir_builder_test.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/codegen/udf_ir_builder_test.cc) so that the developer can perform function checking easily. +- Configuring the function template: `TemplateClass`. +- Configuring supported parameter types: `args_in`. +- Configuring the return method: `return_by_arg()` + - When **return_by_arg(false)**, the result is directly returned through the `return` statement. OpenMLDB defaults to `return_by_arg(false)`. + - When **return_by_arg(true)**, the result is returned through parameters. +The following code shows the code example of registering `abs` scalar function (code can be found at [hybridse/src/udf/default_udf_library.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/default_udf_library.cc)). ```c++ -CheckUdf("function_name", expect_result, arg_value,...); -``` +RegisterExternalTemplate("abs") + .doc(R"( + @brief Return the absolute value of expr. -For each function signature, we at least have to: + Example: -- Add a unit test with a normal result -- Add a unit test with a null result if the result is **nullable** - -**Example**: + @code{.sql} -```c++ -// month(timestamp) normal check -TEST_F(UdfIRBuilderTest, month_timestamp_udf_test) { - Timestamp time(1589958000000L); - CheckUdf("month", 5, time); -} + SELECT ABS(-32); + -- output 32 -// date(timestamp) normal check -TEST_F(UdfIRBuilderTest, timestamp_to_date_test_0) { - CheckUdf, Nullable>( - "date", codec::Date(2020, 05, 20), codec::Timestamp(1589958000000L)); -} -// date(timestamp) null check -TEST_F(UdfIRBuilderTest, timestamp_to_date_test_null_0) { - CheckUdf, Nullable>("date", nullptr, nullptr); -} -``` + @endcode -#### 2.3.2 Compile and Test + @param expr -```bash -cd ./hybridse -mkdir -p build -cd build -cmake .. -DCMAKE_BUILD_TYPE=Release -DTESTING_ENABLE=ON -make udf_ir_builder_test -j4 -SQL_CASE_BASE_DIR=${OPENMLDB_DIR} ./src/codegen/udf_ir_builder_test + @since 0.1.0)") + .args_in(); ``` +Development of generic template built-in scalar functions is similar to that of single data type built-in scalar functions. In this document, we won't delve into detailed discussions on generic template functions. The remaining content in this chapter primarily focuses on the development of single data type built-in scalar functions. - -## 3. 
Built-in Function Development Template +(sfunc_category)= +## 3.2 Built-in Scalar Function Development Template We classified built-in function into 3 types based on its return type: - SQL functions return **BOOL** or Numeric types, e.g., **SMALLINT**, **INT**, **BIGINT**, **FLOAT**, **DOUBLE** - -- SQL functions return **STRING**, **TIMESTAMP** or **DATE** - - - ```c++ - // SQL: STRING FUNC_STR(INT) - void func_output_str(int32_t, codec::StringRef*); - ``` - +- SQL functions return **STRING**, **TIMESTAMP**, **DATE**, **ArrayRef** - SQL functions return ***Nullable*** type Return types have a greater impact on the built-in function's behaviour. We will cover the details of the three types of SQL functions in the following sections. -### 3.1 SQL Functions Return **BOOL** or Numeric Types +(return_bool)= + +### 3.2.1 SQL Functions Return **BOOL** or Numeric Types If an SQL function returns a BOOL or Numeric type (e.g., **BOOL**, **SMALLINT**, **INT**, **BIGINT**, **FLOAT**, **DOUBLE**), then the C++ function should be designed to return the corresponding C++ type(`bool`, `int16_t`, `int32_t`, `int64_t`, `float`, `double`). @@ -351,7 +451,7 @@ RegisterExternal("my_func") )"); ``` -### 3.2 SQL Functions Return **STRING**, **TIMESTAMP** or **DATE** +### 3.2.2 SQL Functions Return **STRING**, **TIMESTAMP** or **DATE** If an SQL function returns **STRING**, **TIMESTAMP** or **DATE**, then the C++ function result should be returned in the parameter with the corresponding C++ pointer type (`codec::StringRef*`, `codec::Timestamp*`, `codec::Date*`). @@ -359,26 +459,31 @@ Thus the C++ function can be declared and implemented as follows: ```c++ # hybridse/src/udf/udf.h -namespace udf { - namespace v1 { - void func(Arg1 arg1, Arg2 arg2, ..., Ret* result); - } // namespace v1 -} // namespace udf +namespace hybridse { + namespace udf { + namespace v1 { + void func(Arg1 arg1, Arg2 arg2, ..., Ret* result); + } + } +} + ``` ```c++ # hybridse/src/udf/udf.cc -namespace udf { - namespace v1 { - void func(Arg1 arg1, Arg2 arg2, ..., Ret* ret) { - // ... - // *ret = result value +namespace hybridse { + namespace udf { + namespace v1 { + void func(Arg1 arg1, Arg2 arg2, ..., Ret* ret) { + // ... + // *ret = result value + } } - } // namespace v1 -} // namespace udf + } +} ``` -Configure and register the function into `DefaultUdfLibary` in[hybridse/src/udf/default_udf_library.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/default_udf_library.cc): +Configure and register the function into `DefaultUdfLibary` in[hybridse/src/udf/default_udf_library.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/default_udf_library.cc). Note that if the function needs to return through parameter, `return_by_arg(true)` needs to be configured. ```c++ # hybridse/src/udf/default_udf_library.cc @@ -390,7 +495,7 @@ RegisterExternal("my_func") )"); ``` -### 3.3 SQL Functions Return ***Nullable*** type +### 3.2.3 SQL Functions Return ***Nullable*** type If an SQL function return type is ***Nullable***, then we need one more `bool*` parameter to return a `is_null` flag. @@ -439,7 +544,9 @@ RegisterExternal("my_func") )"); ``` -### 3.4 SQL Functions Handle Nullable Argument +(arg_nullable)= + +### 3.2.4 SQL Functions Handle Nullable Argument Generally, OpenMLDB will return a ***NULL*** for a function when any one of its argurements is ***NULL***. @@ -488,11 +595,11 @@ RegisterExternal("my_func") )"); ``` -## 4. SQL Functions Development Examples +## 3.3. 
SQL Functions Development Examples -### 4.1 SQL Functions Return **BOOL** or Numeric Types: `INT Month(TIMESTAMP)` Function +### 3.3.1 SQL Functions Return **BOOL** or Numeric Types: `INT Month(TIMESTAMP)` Function -`INT Month(TIMESTAMP)` function returns the month for a given `timestamp`. Check [3.1 SQL functions return **BOOL** or Numeric types](#3.1-SQL functions return **BOOL** or Numeric types) for more details. +`INT Month(TIMESTAMP)` function returns the month for a given `timestamp`. Check [3.2.1 SQL functions return **BOOL** or Numeric types](return_bool) for more details. #### Step 1: Declare and Implement C++ Functions @@ -557,7 +664,7 @@ namespace udf { #### Step3: Function Unit Test -[Add unit tests](Add Unit Tests) in [src/codegen/udf_ir_builder_test.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/codegen/udf_ir_builder_test.cc). Then [compile and test it](2.3.2 Compile and test). +Add unit test `TEST_F` in [src/codegen/udf_ir_builder_test.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/codegen/udf_ir_builder_test.cc). Then [Compile and Test](compile_ut)。 ```c++ // month(timestamp) normal check @@ -567,7 +674,7 @@ TEST_F(UdfIRBuilderTest, month_timestamp_udf_test) { } ``` -Now, the `udf::v1:month` has been registered into the default library with the name `month`. As a result, we can call `month` in an SQL query while ignoring upper and lower cases. +Recompile `OpenMLDB` upon completing the development. Now, the `udf::v1:month` has been registered into the default library with the name `month`. As a result, we can call `month` in an SQL query while ignoring upper and lower cases. ```SQL select MONTH(TIMESTAMP(1590115420000)) as m1, month(timestamp(1590115420000)) as m2; @@ -578,9 +685,9 @@ select MONTH(TIMESTAMP(1590115420000)) as m1, month(timestamp(1590115420000)) as ---- ---- ``` -### 4.2 SQL Functions Return **STRING**, **TIMESTAMP** or **DATE** - `STRING String(BOOL)` +### 3.3.2 SQL Functions Return **STRING**, **TIMESTAMP** or **DATE** - `STRING String(BOOL)` -The `STRING String(BOOL)` function accepts a BOOL type input and converts it to an output of type STRING. Check [3.2 SQL functions return **STRING**, **TIMESTAMP** or **DATE**](#3.2-SQL functions return **STRING**, **TIMESTAMP** or **DATE**) for more details. +The `STRING String(BOOL)` function accepts a **BOOL** type input and converts it to an output of type STRING. Check [3.2.2 SQL functions return **STRING**, **TIMESTAMP** or **DATE**](#322sql-functions-return-string-timestamp-or-date) for more details. #### Step 1: Declare and Implement C++ Functions @@ -659,7 +766,7 @@ namespace hybridse { #### Step3: Function Unit Test -[Add unit tests](Add Unit Tests) in [src/codegen/udf_ir_builder_test.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/codegen/udf_ir_builder_test.cc). Then [compile and test it](2.3.2 Compile and test). +Add unit tests `TEST_F` in [src/codegen/udf_ir_builder_test.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/codegen/udf_ir_builder_test.cc). Then [Compile and Test](compile_ut). ```c++ // string(bool) normal check @@ -670,7 +777,7 @@ TEST_F(UdfIRBuilderTest, bool_to_string_test) { } ``` -Now, the `udf::v1:bool_to_string()` function has been registered into the default library with the name `string`. As a result, we can call `string` in an SQL query while ignoring upper and lower cases. +Recompile `OpenMLDB` upon completing the development. 
Now, the `udf::v1:bool_to_string()` function has been registered into the default library with the name `string`. As a result, we can call `String()` in an SQL query while ignoring upper and lower cases. ```SQL select STRING(true) as str_true, string(false) as str_false; @@ -682,14 +789,13 @@ select STRING(true) as str_true, string(false) as str_false; ``` +### 3.3.3 SQL Functions Return ***Nullable*** Type - `DATE Date(TIMESTAMP)` -### 4.3 SQL Functions Return ***Nullable*** Type - `DATE Date(TIMESTAMP)` - -`DATE Date(TIMESTAMP)()` function converts **TIMESTAMP** type to **DATE** type. Check [3.3 SQL functions return ***Nullable*** type](#3.3-SQL functions return ***Nullable*** type) and [3.2 SQL functions return **STRING**, **TIMESTAMP** or **DATE**](#3.2-SQL functions return **STRING**, **TIMESTAMP** or **DATE**) for more details. +`DATE Date(TIMESTAMP)()` function converts **TIMESTAMP** type to **DATE** type. Check [3.2.3 SQL functions return ***Nullable*** type](#323-sql-functions-return-nullable-type) and [3.2.2 SQL functions return **STRING**, **TIMESTAMP** or **DATE**](#322-sql-functions-return-string-timestamp-or-date) for more details. #### Step 1: Declare and Implement Built-In Functions -We implement a function `timestamp_to_date`to convert `timestamp` to the date type. The input is `timestamp` and the output is nullable `date` which is returned by arguments `codec::Date *output` and `bool *is_null`. +Due to the fact that the `date` type in OpenMLDB is a structured type, when designing functions, the result is not directly returned but is instead stored in the parameters for return. Additionally, considering that date conversions may encounter exceptions or failures, the return result is marked as ***nullable***. Therefore, an additional parameter, ***is_null***, is introduced to indicate whether the result is null or not. Declare the `timestamp_to_date()` function in [hybridse/src/udf/udf.h](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/udf.h): @@ -731,9 +837,10 @@ namespace hybridse { #### Step 2: Register Built-In Function into Default Library -The following example registers the built-in function ` v1::timestamp_to_date` into the default library with the name `"date"`. +The configuration of the function name and function parameters is similar to that of regular functions. However, there are additional considerations for configuring the return value type: -Given the result is a nullable date type, we configure **return_by_arg** as ***true*** and return type as `Nullable`. +- Since the function result is stored in parameters for return, configure `return_by_arg(true)`. +- Since the function result may be null, configure `.returns>`. `DATE Date(TIMESTAMP)` is Date&Time function, developer should configure and register within `DefaultUdfLibrary::InitTimeAndDateUdf()` in [hybridse/src/udf/default_udf_library.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/default_udf_library.cc). @@ -763,7 +870,7 @@ namespace hybridse { #### Step3: Function Unit Test -[Add unit tests](Add Unit Tests) in [src/codegen/udf_ir_builder_test.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/codegen/udf_ir_builder_test.cc). Then [compile and test it](2.3.2 Compile and test). +Add unit tests `TEST_F` in [src/codegen/udf_ir_builder_test.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/codegen/udf_ir_builder_test.cc). Then [Compile and Test](compile_ut). 
```c++ // date(timestamp) normal check @@ -777,7 +884,7 @@ TEST_F(UdfIRBuilderTest, timestamp_to_date_test_null_0) { } ``` -Now, the `udf::v1:timestamp_to_date` has been registered into the default library with the name `date`. As a result, we can call `date()` in an SQL query. +Recompile `OpenMLDB` upon completing the development. Now, the `udf::v1:timestamp_to_date` has been registered into the default library with the name `date`. As a result, we can call `date()` in an SQL query while ignoring upper and lower cases. ```SQL select date(timestamp(1590115420000)) as dt; @@ -788,14 +895,148 @@ select date(timestamp(1590115420000)) as dt; ------------ ``` +## 4. Aggregation Function Development + +### 4.1. Registration and Configuration of Interface + +#### 4.1.1 Registration of Aggregation Functions Supporting a Single Data Type + +The `DefaultUdfLibrary` provides the `RegisterUdaf` interface to facilitate the registration of built-in aggregation functions and initialize the function's registration name. This method requires specifying a data type and only supports declared data types. + + +```c++ +RegisterUdaf("register_func_name") + .templates() + .init("init_func_name", init_func_ptr, return_by_arg=false) + .update("update_func_name", update_func_ptr, return_by_arg=false) + .output("output_func_name", output_func_ptr, return_by_arg=false) +``` + + +Unlike the registration of scalar functions, aggregation functions require the registration of three functions: `init`, `update`, and `output`, which correspond to the initialization of the aggregation function, the update of intermediate states, and the output of the final result. +The configuration for these functions is as follows: + +- Configure parameter types: + - OUT: Output parameter type. + - ST: Intermediate state type. + - IN, ...: Input parameter types. +- Configure the `init` function pointer: `init_func_ptr`, with a function signature of `ST* Init()`. +- Configure the `update` function pointer: `update_func_ptr`, + - If the input is non-nullable, the function signature is `ST* Update(ST* state, IN val1, ...)`. + - If it is necessary to check whether the input is **Nullable**, this parameter can be configured as `Nullable`, and an additional `bool` parameter is added after the corresponding parameter in the function to store information about whether the parameter value is null. + The function signature is: `ST* Update(ST* state, IN val1, bool val1_is_null, ...)`. +- Configure the output function pointer: `output_func_ptr`. + When the function's return value may be null, an additional `bool*` parameter is required to store whether the result is null + (refer to [3.2.3 SQL Functions Return ***Nullable*** type](#323-sql-functions-return-nullable-type). + + +The following code demonstrates an example of adding a new aggregation function `second`. The `second` function returns the non-null second element in the aggregated data. 
For the sake of demonstration, the example supports only the `int32_t` data type: +```c++ +struct Second { + static std::vector* Init() { + auto list = new std::vector(); + return list; + } + + static std::vector* Update(std::vector* state, int32_t val, bool is_null) { + if (!is_null) { + state->push_back(val); + } + return state; + } + + static void Output(std::vector* state, int32_t* ret, bool* is_null) { + if (state->size() > 1) { + *ret = state->at(1); + *is_null = false; + } else { + *is_null = true; + } + delete state; + } +}; + +RegisterUdaf("second") + .templates, Opaque>, Nullable>() + .init("second_init", Second::Init) + .update("second_update", Second::Update) + .output("second_output", reinterpret_cast(Second::Output), true) + .doc(R"( + @brief Get the second non-null value of all values. + + @param value Specify value column to aggregate on. + + Example: + + |value| + |--| + |1| + |2| + |3| + |4| + @code{.sql} + SELECT second(value) OVER w; + -- output 2 + @endcode + @since 0.5.0 + )"); +``` + +#### 4.1.2 Registration of Aggregation Functions Supporting Generics +We also provide the `RegisterUdafTemplate` interface for registering an aggregation function that supports generics. + +```c++ +RegisterUdafTemplate("register_func_name") + .args_in() +``` + +- Configure the aggregation function template: `TemplateClass`. +- Configure all supported parameter types: `args_in`. + +The following code demonstrates an example of registering the `distinct_count` aggregation function. You can find the code in [hybridse/src/udf/default_udf_library.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/default_udf_library.cc). +```c++ +RegisterUdafTemplate("distinct_count") + .doc(R"( + @brief Compute number of distinct values. + + @param value Specify value column to aggregate on. + + Example: + + |value| + |--| + |0| + |0| + |2| + |2| + |4| + @code{.sql} + SELECT distinct_count(value) OVER w; + -- output 3 + @endcode + @since 0.1.0 + )") + .args_in(); +``` + +## 5. Example Code Reference + +### 5.1. Scalar Function Example Code Reference +For more scalar function example code, you can refer to: +[hybridse/src/udf/udf.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/udf.cc) + +### 5.2. Aggregation Function Example Code Reference +For more aggregation function example code, you can refer to: +[hybridse/src/udf/default_udf_library.cc](https://github.com/4paradigm/OpenMLDB/blob/main/hybridse/src/udf/default_udf_library.cc) -## 5. Document Management -Documents for all built-in functions can be found in [Built-in Functions](http://4paradigm.github.io/OpenMLDB/zh/main/reference/sql/udfs_8h.html). It is a markdown file automatically generated from source, so please do not edit it directly. +## 6. Documentation Management -- If you are adding a document for a new function, please refer to [2.2.4 Documenting Function](#224-documenting-function). -- If you are trying to revise a document of an existing function, you can find source code in the files of `hybridse/src/udf/default_udf_library.cc` or `hybridse/src/udf/default_defs/*_def.cc` . +Documentations for all built-in functions can be found in [Built-in Functions](../openmldb_sql/udfs_8h.md). It is a markdown file automatically generated from source, so please do not edit it directly. -There is a daily workflow that automatically converts the source code to a readable format, which are the contents inside the `docs/*/reference/sql/functions_and_operators` directory. 
The document website will also be updated accordingly. If you are interested in this process, you can refer to the source directory [udf_doxygen](https://github.com/4paradigm/OpenMLDB/tree/main/hybridse/tools/documentation/udf_doxygen). +- If you need to document the newly added functions, please refer to section [2.2.4 Documenting Function](#224-documenting-function) which explains that the documentation for built-in functions is managed in CPP source code. Subsequently, a series of steps will be taken to generate more readable documentation, which will appear in the `docs/*/openmldb_sql/` directory on the website. +- If you need to modify the documentation for an existing function, you can locate the corresponding documentation in the file `hybridse/src/udf/default_udf_library.cc` or `hybridse/src/udf/default_defs/*_def.cc` and make the necessary changes. +In the OpenMLDB project, a GitHub Workflow task is scheduled on a daily basis to regularly update the relevant documentation here. Therefore, modifications to the documentation for built-in functions only require changing the content in the corresponding source code locations as described above. The `docs` directory and the content on the website will be periodically updated accordingly. For details on the documentation generation process, you can check [udf_doxygen](https://github.com/4paradigm/OpenMLDB/tree/main/hybridse/tools/documentation/udf_doxygen). diff --git a/docs/en/developer/contributing.md b/docs/en/developer/contributing.md index a8112053565..86c0baafdcb 100644 --- a/docs/en/developer/contributing.md +++ b/docs/en/developer/contributing.md @@ -1,3 +1,26 @@ # Contributing Please refer to [Contribution Guideline](https://github.com/4paradigm/OpenMLDB/blob/main/CONTRIBUTING.md) +## Pull Request (PR) Guidelines + +When submitting a PR, please pay attention to the following points: +- PR Title: Please adhere to the [commit format](https://github.com/4paradigm/rfcs/blob/main/style-guide/commit-convention.md#conventional-commits-reference) for the PR title. **Note that this refers to the PR title, not the commits within the PR**. +```{note} +If the title does not meet the standard, `pr-linter / pr-name-lint (pull_request)` will fail with a status of `x`. +``` +- PR Checks: There are various checks in a PR, and only `codecov/patch` and `codecov/project` may not pass. Other checks should pass. If other checks do not pass and you cannot fix them or believe they should not be fixed, you can leave a comment in the PR. + +- PR Description: Please explain the intent of the PR in the first comment of the PR. We provide a PR comment template, and while you are not required to follow it, ensure that there is sufficient explanation. + +- PR Files Changed: Pay attention to the `files changed` in the PR. Do not include code changes outside the scope of the PR intent. You can generally eliminate unnecessary diffs by using `git merge origin/main` followed by `git push` to the PR branch. If you need assistance, leave a comment in the PR. +```{note} +If you are not modifying the code based on the main branch, when the PR intends to merge into the main branch, the `files changed` will include unnecessary code. For example, if the main branch is at commit 10, and you start from commit 9 of the old main, add new_commit1, and then add new_commit2 on top of new_commit1, you actually only want to submit new_commit2, but the PR will include new_commit1 and new_commit2. 
+In this case, just use `git merge origin/main` and `git push` to the PR branch to only include the changes. +``` +```{seealso} +If you want the branch code to be cleaner, you can avoid using `git merge` and use `git rebase -i origin/main` instead. It will add your changes one by one on top of the main branch. However, it will change the commit history, and you need `git push -f` to override the branch. +``` + +## Compilation Guidelines + +For compilation details, refer to the [Compilation Documentation](../deploy/compile.md). To avoid the impact of operating systems and tool versions, we recommend compiling OpenMLDB in a compilation image. Since compiling the entire OpenMLDB requires significant space, we recommend using `OPENMLDB_BUILD_TARGET` to specify only the parts you need. \ No newline at end of file diff --git a/docs/en/developer/index.rst b/docs/en/developer/index.rst index 755fa3873f9..d36c4913923 100644 --- a/docs/en/developer/index.rst +++ b/docs/en/developer/index.rst @@ -10,4 +10,3 @@ Developers built_in_function_develop_guide sdk_develop python_dev - udf_develop_guide diff --git a/docs/en/developer/python_dev.md b/docs/en/developer/python_dev.md index 1f18ede390c..43cf75f3f2f 100644 --- a/docs/en/developer/python_dev.md +++ b/docs/en/developer/python_dev.md @@ -2,9 +2,19 @@ There are two modules in `python/`: Python SDK and an OpenMLDB diagnostic tool. -## SDK Testing Methods +## SDK + +The Python SDK itself does not depend on the pytest and tox libraries used for testing. If you want to use the tests in the tests directory for testing, you can download the testing dependencies using the following method. + +``` +pip install 'openmldb[test]' +pip install 'dist/....whl[test]' +``` + +### Testing Method + +Run the command `make SQL_PYSDK_ENABLE=ON OPENMLDB_BUILD_TARGET=cp_python_sdk_so` under the root directory and make sure the library in `python/openmldb_sdk/openmldb/native/` was the latest native library. Testing typically requires connecting to an OpenMLDB cluster. If you haven't started a cluster yet, or if you've made code changes to the service components, you'll also need to compile the TARGET openmldb and start a onebox cluster. You can refer to the launch section of `steps/test_python.sh` for guidance. -Run the command `make SQL_PYSDK_ENABLE=ON OPENMLDB_BUILD_TARGET=cp_python_sdk_so` under the root directory and make sure the library in `python/openmldb_sdk/openmldb/native/` was the latest native library. 1. Package installation test: Install the compiled `whl`, then run `pytest tests/`. You can use the script `steps/test_python.sh` directly. 2. Dynamic test: Make sure there isn't OpenMLDB in `pip` or the compiled `whl`. Run `pytest test/` in `python/openmldb_sdk`, thereby you can easily debug. @@ -32,6 +42,11 @@ If the python log messages are required in all tests(even successful tests), ple pytest -so log_cli=true --log-cli-level=DEBUG tests/ ``` +You can also use the module mode for running tests, which is suitable for actual runtime testing. +``` +python -m diagnostic_tool.diagnose ... +``` + ## Conda If you use conda, `pytest` may found the wrong python, then get errors like `ModuleNotFoundError: No module named 'IPython'`. Please use `python -m pytest`. diff --git a/docs/en/developer/sdk_develop.md b/docs/en/developer/sdk_develop.md index 00c5edf7725..20246500520 100644 --- a/docs/en/developer/sdk_develop.md +++ b/docs/en/developer/sdk_develop.md @@ -9,22 +9,19 @@ The OpenMLDB SDK can be divided into several layers, as shown in the figure. 
The The bottom layer is the SDK core layer, which is implemented as [SQLClusterRouter](https://github.com/4paradigm/OpenMLDB/blob/b6f122798f567adf2bb7766e2c3b81b633ebd231/src/sdk/sql_cluster_router.h#L110). It is the core layer of **client**. All operations on OpenMLDB clusters can be done by using the methods of `SQLClusterRouter` after proper configuration. Three core methods of this layer that developers may need to use are: - 1. [ExecuteSQL](https://github.com/4paradigm/OpenMLDB/blob/b6f122798f567adf2bb7766e2c3b81b633ebd231/src/sdk/sql_cluster_router.h#L160) supports the execution of all SQL commands, including DDL, DML and DQL. 2. [ExecuteSQLParameterized](https://github.com/4paradigm/OpenMLDB/blob/b6f122798f567adf2bb7766e2c3b81b633ebd231/src/sdk/sql_cluster_router.h#L166)supports parameterized SQL. 3. [ExecuteSQLRequest](https://github.com/4paradigm/OpenMLDB/blob/b6f122798f567adf2bb7766e2c3b81b633ebd231/src/sdk/sql_cluster_router.h#L156)is the special methods for the OpenMLDB specific execution mode: [Online Request mode](../tutorial/modes.md#4-the-online-request-mode). - +Other methods, such as CreateDB, DropDB, DropTable, have not been removed promptly due to historical reasons. Developers don't need to be concerned about them. ### Wrapper Layer -Due to the complexity of the implementation of the SDK Layer, we didn't develop the Java and Python SDKs from scratch, but to use Java and Python to call the **SDK Layer**. Specifically, we made a wrapper layer using Swig. +Due to the complexity of the implementation of the SDK Layer, we didn't develop the Java and Python SDKs from scratch, but to use Java and Python to call the **SDK Layer**. Specifically, we made a wrapper layer using swig. Java Wrapper is implemented as [SqlClusterExecutor](https://github.com/4paradigm/OpenMLDB/blob/main/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/SqlClusterExecutor.java). It is a simple wrapper of `sql_router_sdk`, including the conversion of input types, the encapsulation of returned results, the encapsulation of returned errors. Python Wrapper is implemented as [OpenMLDBSdk](https://github.com/4paradigm/OpenMLDB/blob/main/python/openmldb/sdk/sdk.py). Like the Java Wrapper, it is a simple wrapper as well. - - ### User Layer Although the Wrapper Layer can be used directly, it is not convenient enough. So, we develop another layer, the User Layer of the Java/Python SDK. @@ -36,7 +33,8 @@ The Python User Layer supports the `sqlalchemy`. See [sqlalchemy_openmldb](https We want an easier to use C++ SDK which doesn't need a Wrapper Layer. Therefore, in theory, developers only need to design and implement the user layer, which calls the SDK layer. -However, in consideration of code reuse, the SDK Layer code may be changed to some extent, or the core SDK code structure may be adjusted (for example, exposing part of the SDK Layer header file, etc.). + +However, in consideration of code reuse, the SDK Layer code may be changed to some extent, or the core SDK code structure may be adjusted (for example, exposing part of the SDK Layer header file, etc.). ## Details of SDK Layer @@ -48,7 +46,6 @@ The first two methods are using two options, which create a server connecting Cl ``` These two methods, which do not expose the metadata related DBSDK, are suitable for ordinary users. The underlayers of Java and Python SDK also use these two approaches. 
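For a concrete picture of the options-based path described above, here is a minimal, hedged C++ sketch of connecting through the SDK core layer and issuing a statement. The option fields (`zk_cluster`, `zk_path`), the factory `NewClusterSQLRouter`, and the `ExecuteSQL` signature are assumptions taken from `src/sdk/sql_router.h` and should be verified against the current header before use:

```c++
// Minimal sketch (assumed API, see src/sdk/sql_router.h): create a router from
// cluster options and run one SQL statement through the SDK core layer.
#include <memory>
#include "sdk/sql_router.h"

int main() {
    ::openmldb::sdk::SQLRouterOptions options;
    options.zk_cluster = "127.0.0.1:2181";  // ZooKeeper address of the cluster (example value)
    options.zk_path = "/openmldb";          // zk_root_path used when the cluster was deployed
    auto router = ::openmldb::sdk::NewClusterSQLRouter(options);
    if (!router) {
        return 1;  // failed to connect to the cluster
    }
    ::hybridse::sdk::Status status;
    auto rs = router->ExecuteSQL("demo_db", "SELECT 1;", &status);  // DDL/DML/DQL all go through ExecuteSQL
    return status.code == 0 ? 0 : 1;  // status.code assumed to be 0 on success
}
```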
- Another way is to create based on DBSDK: ``` explicit SQLClusterRouter(DBSDK* sdk); @@ -85,4 +82,18 @@ If you only want to run JAVA testing, try the commands below: ``` mvn test -pl openmldb-jdbc -Dtest="SQLRouterSmokeTest" mvn test -pl openmldb-jdbc -Dtest="SQLRouterSmokeTest#AnyMethod" -``` \ No newline at end of file +``` + +### batchjob test + +batchjob tests can be done using the following method: +``` +$SPARK_HOME/bin/spark-submit --master local --class com._4paradigm.openmldb.batchjob.ImportOfflineData --conf spark.hadoop.hive.metastore.uris=thrift://localhost:9083 --conf spark.openmldb.zk.root.path=/openmldb --conf spark.openmldb.zk.cluster=127.0.0.1:2181 openmldb-batchjob/target/openmldb-batchjob-0.6.5-SNAPSHOT.jar load_data.txt true +``` + +Alternatively, you can copy the compiled openmldb-batchjob JAR file to the `lib` directory of the task manager in the OpenMLDB cluster. Then, you can use the client or Taskmanager Client to send commands for testing. + +When using Hive as a data source, make sure the metastore service is available. For local testing, you can start the metastore service in the Hive directory with the default address being `thrift://localhost:9083`. +``` +bin/hive --service metastore +``` diff --git a/docs/en/developer/udf_develop_guide.md b/docs/en/developer/udf_develop_guide.md deleted file mode 100644 index 4c5aff6d2e1..00000000000 --- a/docs/en/developer/udf_develop_guide.md +++ /dev/null @@ -1,216 +0,0 @@ -# UDF Function Development Guideline -## 1. Background -Although there are already hundreds of built-in functions, they can not satisfy the needs in some cases. In the past, this could only be done by developing new built-in functions. Built-in function development requires a relatively long cycle because it needs to recompile binary files and users have to wait for new version release. -In order to help users to quickly develop computing functions that are not provided by OpenMLDB, we develop the mechanism of user dynamic registration function. OpenMLDB will load the compiled library contains user defined function when executing `Create Function` statement. - -SQL functions can be categorised into scalar functions and aggregate functions. An introduction to scalar functions and aggregate functions can be seen [here](./built_in_function_develop_guide.md). -## 2. Development Procedures -### 2.1 Develop UDF functions -#### 2.1.1 Naming Specification of C++ Built-in Function -- The naming of C++ built-in function should follow the [snake_case](https://en.wikipedia.org/wiki/Snake_case) style. -- The name should clearly express the function's purpose. -- The name of a function should not be the same as the name of a built-in function or other custom functions. The list of all built-in functions can be seen [here](../reference/sql/udfs_8h.md). - -#### 2.1.2 -The types of the built-in C++ functions' parameters should be BOOL, NUMBER, TIMESTAMP, DATE, or STRING. -The SQL types corresponding to C++ types are shown as follows: - -| SQL Type | C/C++ Type | -|:----------|:------------| -| BOOL | `bool` | -| SMALLINT | `int16_t` | -| INT | `int32_t` | -| BIGINT | `int64_t` | -| FLOAT | `float` | -| DOUBLE | `double` | -| STRING | `StringRef` | -| TIMESTAMP | `Timestamp` | -| DATE | `Date` | - - -#### 2.1.3 Parameters and Return Values - -**Return Value**: - -* If the output type of the UDF is a basic type and not support null, it will be processed as a return value. 
-* If the output type of the UDF is a basic type and support null, it will be processed as function parameter. -* If the output type of the UDF is STRING, TIMESTAMP or DATE, it will return through the last parameter of the function. - -**Parameters**: - -* If the parameter is a basic type, it will be passed by value. -* If the output type of the UDF is STRING, TIMESTAMP or DATE, it will be passed by pointer. -* The first parameter must be `UDFContext* ctx`. The definition of [UDFContext](../../../include/udf/openmldb_udf.h) is: - -```c++ - struct UDFContext { - ByteMemoryPool* pool; // Used for memory allocation. - void* ptr; // Used for the storage of temporary variables for aggregrate functions. - }; -``` - -**Note**: -- if the input value is nullable, there are added `is_null` parameter to lable whether is null -- if the return value is nullable, it should be return by argument and add another `is_null` parameter - -For instance, declare a UDF function that input is nullable and return value is nullable. -```c++ -extern "C" -void sum(::openmldb::base::UDFContext* ctx, int64_t input1, bool is_null, int64_t input2, bool is_null, int64_t* output, bool* is_null); -``` - -**Function Declaration**: - -* The functions must be declared by extern "C". - -#### 2.1.4 Memory Management - -- It is not allowed to use `new` operator or `malloc` function to allocate memory for input and output argument in UDF functions. -- If you use `new` operator or `malloc` function to allocate memory for UDFContext::ptr in UDAF init functions, it need to be freed in output function mannually. -- If you need to request additional memory space dynamically, please use the memory management interface provided by OpenMLDB. OpenMLDB will automatically free the memory space after the function is executed. - -```c++ - char *buffer = ctx->pool->Alloc(size); -``` - -- The maximum size of the space allocated at a time cannot exceed 2M bytes. - - -#### 2.1.5 Implement the UDF Function -- The head file `udf/openmldb_udf.h` should be included. -- Develop the logic of the function. - -```c++ -#include "udf/openmldb_udf.h" // The headfile - -// Develop a UDF which slices the first 2 characters of a given string. -extern "C" -void cut2(::openmldb::base::UDFContext* ctx, ::openmldb::base::StringRef* input, ::openmldb::base::StringRef* output) { - if (input == nullptr || output == nullptr) { - return; - } - uint32_t size = input->size_ <= 2 ? input->size_ : 2; - //To apply memory space in UDF functions, please use ctx->pool. - char *buffer = ctx->pool->Alloc(size); - memcpy(buffer, input->data_, size); - output->size_ = size; - output->data_ = buffer; -} -``` - - -#### 2.1.5 Implement the UDAF Function -- The head file `udf/openmldb_udf.h` should be included. -- Develop the logic of the function. - -It need to develop three functions as below: -- init function. do some init works in this function such as alloc memory or init variables. The function name should be "xxx_init" -- update function. Update the aggretrate value. The function name should be "xxx_update" -- output function. Extract the aggregrate value and return. The function name should be "xxx_output" - -**Node**: It should return `UDFContext*` as return value in init and update function. 
- -```c++ -#include "udf/openmldb_udf.h" - -extern "C" -::openmldb::base::UDFContext* special_sum_init(::openmldb::base::UDFContext* ctx) { - // allocte memory by memory poll - ctx->ptr = ctx->pool->Alloc(sizeof(int64_t)); - // init the value - *(reinterpret_cast(ctx->ptr)) = 10; - // return the pointer of UDFContext - return ctx; -} - -extern "C" -::openmldb::base::UDFContext* special_sum_update(::openmldb::base::UDFContext* ctx, int64_t input) { - // get the value from ptr in UDFContext - int64_t cur = *(reinterpret_cast(ctx->ptr)); - cur += input; - *(reinterpret_cast(ctx->ptr)) = cur; - // return the pointer of UDFContext - return ctx; -} - -// get the result from ptr in UDFcontext and return -extern "C" -int64_t special_sum_output(::openmldb::base::UDFContext* ctx) { - return *(reinterpret_cast(ctx->ptr)) + 5; -} - -``` - - -For more UDF implementation, see [here](../../../src/examples/test_udf.cc). - - -### 2.2 Compile the Dynamic Library - -- Copy the `include` directory (`https://github.com/4paradigm/OpenMLDB/tree/main/include`) to a certain path (like `/work/OpenMLDB/`) for later compiling. -- Run the compiling command. `-I` specifies the path of `include` directory. `-o` specifies the name of the dynamic library. - -```shell -g++ -shared -o libtest_udf.so examples/test_udf.cc -I /work/OpenMLDB/include -std=c++17 -fPIC -``` - -### 2.3 Copy the Dynamic Library -The compiled dynamic libraries should be copied into the `udf` directories for both TaskManager and tablets. Please create a new `udf` directory if it does not exist. -- The `udf` directory of a tablet is `path_to_tablet/udf`. -- The `udf` directory of TaskManager is `path_to_taskmanager/taskmanager/bin/udf`. - -For example, if the deployment paths of a tablet and TaskManager are both `/work/openmldb`, the structure of the directory is shown below: - -``` - /work/openmldb/ - ├── bin - ├── conf - ├── taskmanager - │   ├── bin - │   │   ├── taskmanager.sh - │   │   └── udf - │   │   └── libtest_udf.so - │   ├── conf - │   └── lib - ├── tools - └── udf -    └── libtest_udf.so -``` - -```{note} -- Note that, for multiple tablets, the library needs to be copied to every one. -- Moreover, dynamic libraries should not be deleted before the execution of `DROP FUNCTION`. -``` - - -### 2.4 Register, Drop and Show the Functions -For registering, please use [CREATE FUNCTION](../reference/sql/ddl/CREATE_FUNCTION.md). -```sql -CREATE FUNCTION cut2(x STRING) RETURNS STRING OPTIONS (FILE='libtest_udf.so'); -``` - -Create an udaf function that input value and return value support null. -```sql -CREATE AGGREGATE FUNCTION third(x BIGINT) RETURNS BIGINT OPTIONS (FILE='libtest_udf.so', ARG_NULLABLE=true, RETURN_NULLABLE=true); -``` - -```{note} -- The types of parameters and return values must be consistent with the implementation of the code. -- `FILE` specifies the file name of the dynamic library. It is not necessary to include a path. -- A UDF function can only work on one type. Please create multiple functions for multiple types. -``` - -After successful registration, the function can be used. -```sql -SELECT cut2(c1) FROM t1; -``` - -You can view registered functions through `SHOW FUNCTIONS`. -```sql -SHOW FUNCTIONS; -``` - -Please use the `DROP FUNCTION` to delete a registered function. 
-```sql -DROP FUNCTION cut2; -``` diff --git a/docs/en/faq/client_faq.md b/docs/en/faq/client_faq.md new file mode 100644 index 00000000000..7cd29df9877 --- /dev/null +++ b/docs/en/faq/client_faq.md @@ -0,0 +1,102 @@ +# Client FAQ + +## "fail to get tablet ..." + +First check whether any tablet server in the cluster is unexpectedly offline or whether any online tables are unreadable or unwritable. It is recommended to use [openmldb_tool](../maintain/diagnose.md) for diagnosis, using the `status` (`status --diff`) and `inspect online` commands. +If the diagnostic tool detects abnormal conditions in offline or online tables, it will output warnings and suggest the next steps. +If manual inspection is required, follow these two steps: +- Execute `show components` to check whether the server is in the list. If TaskManager is offline, it will not be in the list. If a Tablet is offline, it will be in the list but with a status of offline. If there are offline servers, **restart the server and add it back to the cluster**. +- Execute `show table status like '%'` (if your version is older and does not support `like`, query the system db and user db separately). Check whether the "Warnings" field for each table reports any errors. + +Common errors include messages like `real replica number X does not match the configured replicanum X`. For detailed error information, please refer to [SHOW TABLE STATUS](../openmldb_sql/ddl/SHOW_TABLE_STATUS.md). These errors indicate that the table is currently experiencing issues and cannot serve reads and writes normally, which is typically due to Tablet issues. + +## Why Do I Receive Warnings of "Reached timeout ..."? +``` +rpc_client.h:xxx] request error. [E1008] Reached timeout=xxxms +``` +This occurs because the timeout set for the RPC request sent by the client is too short, so the client actively disconnects. Note that this is the RPC timeout. You need to change the general `request_timeout` configuration. + +1. CLI: Configure `--request_timeout_ms` at startup. +2. JAVA/Python SDK: Adjust `SdkOption.requestTimeout` in the Option or URL. + +```{note} +This error usually does not occur with synchronized offline commands, as the timeout of a synchronized offline command is set to the maximum time acceptable to TaskManager. +``` + +## Why Do I Receive Warnings of "Got EOF of Socket ..."? +``` +rpc_client.h:xxx] request error. [E1014]Got EOF of Socket{id=x fd=x addr=xxx} (xx) +``` +This occurs because the `addr` end actively closed the connection, and the `addr` address is most likely the TaskManager. This does not mean that the TaskManager is abnormal; rather, the TaskManager considered this connection inactive for longer than `keepAliveTime` and actively closed the communication channel. + +In versions 0.5.0 and later, you can increase the `server.channel_keep_alive_time` of the TaskManager to improve tolerance for inactive channels. The default value is 1800 seconds (0.5 hours). Especially when using synchronous offline commands, this value may need to be increased appropriately. + +In versions before 0.5.0, this configuration cannot be changed. Please upgrade the TaskManager version. + +## Why is the Offline Query Result Displaying Chinese Characters as Garbled? + +When using offline queries, query results containing Chinese characters may appear garbled. This is mainly related to the system's default encoding format and the encoding format parameters of Spark tasks.
+ +If you encounter garbled characters, you can resolve this by adding the Spark advanced parameters `spark.driver.extraJavaOptions=-Dfile.encoding=utf-8` and `spark.executor.extraJavaOptions=-Dfile.encoding=utf-8`. + +For client configuration methods, you can refer to the [Spark Client Configuration](../reference/client_config/client_spark_config.md), or you can add this configuration to the TaskManager configuration file. + +``` +spark.default.conf=spark.driver.extraJavaOptions=-Dfile.encoding=utf-8;spark.executor.extraJavaOptions=-Dfile.encoding=utf-8 +``` + +## How to Configure TaskManager to Access a YARN Cluster with Kerberos Enabled? + +If the YARN cluster has Kerberos authentication enabled, TaskManager can access the YARN cluster with Kerberos authentication by adding the following configuration. Please note to modify the `keytab` path and `principal` account according to the actual configuration. + +``` +spark.default.conf=spark.yarn.keytab=/tmp/test.keytab;spark.yarn.principal=test@EXAMPLE.COM +``` + +## How to Configure Client's Core Logs? + +Client core logs mainly consist of two types: ZooKeeper logs and SDK logs (glog logs), and they are independent of each other. + +ZooKeeper Logs: + +1. CLI: Configure `--zk_log_level` during startup to adjust the log level, and use `--zk_log_file` to specify the log file. +2. JAVA/Python SDK: Use `zkLogLevel` to adjust the level and `zkLogFile` to specify the log file in Option or URL. + +- `zk_log_level` (int, default=0, i.e., DISABLE_LOGGING): Prints logs at this level and **below**. 0 - disable all zk logs, 1 - error, 2 - warn, 3 - info, 4 - debug. + +SDK Logs (glog Logs): + +1. CLI: Configure `--glog_level` during startup to adjust the level, and use `--glog_dir` to specify the log file. +2. JAVA/Python SDK: Use `glogLevel` to adjust the level and `glogDir` to specify the log file in Option or URL. + +- `glog_level` (int, default=1, i.e., WARNING): Prints logs at this level and **above**. INFO, WARNING, ERROR, and FATAL logs correspond to 0, 1, 2, and 3, respectively. + + +## Insert Error with Log `please use getInsertRow with ... first`. + +When using `InsertPreparedStatement` for insertion in the JAVA client or inserting with SQL and parameters in Python, there is an underlying cache effect in the client. The process involves generating SQL cache with the first step `getInsertRow` and returning the SQL along with the parameter information to be completed. The second step actually executes the insert, and it requires using the SQL cache cached in the first step. Therefore, when multiple threads use the same client, it's possible that frequent updates to the cache table due to frequent insertions and queries might evict the SQL cache you want to execute, causing it to seem like the first step `getInsertRow` was not executed. + +Currently, you can avoid this issue by increasing the `maxSqlCacheSize` configuration option. This option is only supported in the JAVA/Python SDKs. + +## Offline Command Error + +``` +java.lang.OutOfMemoryError: Java heap space +``` + +``` +Container killed by YARN for exceeding memory limits. 5 GB of 5 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead. +``` + +When encountering the aforementioned log messages, it indicates that the offline task requires more resources than the current configuration provides. 
This typically occurs in the following situations: + +- The Spark configuration for the offline command is set to `local[*]`, the machine has a high number of cores, and the concurrency is too high, resulting in excessive resource consumption. +- The memory configuration is too small. + +If using local mode and the resources on a single machine are limited, consider reducing concurrency. If you choose not to reduce concurrency, adjust the `spark.driver.memory` and `spark.executor.memory` Spark configuration options. You can write these configurations in the `conf/taskmanager.properties` file in the TaskManager's running directory, restart the TaskManager, or use the CLI client for configuration. For more information, refer to the [Spark Client Configuration](../reference/client_config/client_spark_config.md). + +``` +spark.default.conf=spark.driver.memory=16g;spark.executor.memory=16g +``` + +When the master is local, adjust the memory of the driver, not the executor. If you are unsure, you can adjust both. diff --git a/docs/en/faq/index.rst b/docs/en/faq/index.rst new file mode 100644 index 00000000000..a5d1e94a540 --- /dev/null +++ b/docs/en/faq/index.rst @@ -0,0 +1,10 @@ +============================= +FAQ +============================= + + +.. toctree:: + :maxdepth: 1 + + client_faq + server_faq diff --git a/docs/en/faq/server_faq.md b/docs/en/faq/server_faq.md new file mode 100644 index 00000000000..c7fd80d94f0 --- /dev/null +++ b/docs/en/faq/server_faq.md @@ -0,0 +1,64 @@ +# Server FAQ + +If there are any changes or issues in the server, please first check the command `openmldb_tool status` and `inspect online` to see if the cluster is functioning properly. + +## Deployment FAQ + +### 1. How to confirm that the cluster is running normally? +Although there is a one-click startup script, due to the numerous configurations, issues such as "port already in use" or "directory without read-write permissions" may occur. These problems are usually discovered only after the server process has started, and there is no immediate feedback upon exit. (If monitoring is configured, you can check directly through the monitoring system.) + +Therefore, please first confirm that all server processes in the cluster are running normally. + +You can use `ps aux | grep openmldb` or the SQL command `show components;` to check. (Note that if you use a daemon, the openmldb server process may be in a loop of starting and stopping, which does not necessarily mean continuous operation. You can confirm this through logs or the connection time from `show components;`.) + +If the processes are alive, but the cluster is still behaving abnormally, it is necessary to check the server logs. Priority should be given to examining WARN and ERROR level logs, as they often indicate the root cause. + +### 2. What if the data is not automatically restored successfully? + +In normal circumstances, when we restart the service, the data in the table will automatically be recovered. 
However, there are cases where recovery may fail, and common failure scenarios include: + +- Tablet abnormal exit +- Multiple-replica tables have multiple replicas in tablets that restart simultaneously or too quickly, causing some `auto_failover` operations not to complete before the tablet restarts +- `auto_failover` set to `false` + +After the service has successfully started, you can use `gettablestatus` to obtain the status of all tables: + +``` +python tools/openmldb_ops.py --openmldb_bin_path=./bin/openmldb --zk_cluster=172.24.4.40:30481 --zk_root_path=/openmldb --cmd=gettablestatus +``` + +If there are `Warnings` in the table, you can use `recoverdata` to automatically recover the data: +``` +python tools/openmldb_ops.py --openmldb_bin_path=./bin/openmldb --zk_cluster=172.24.4.40:30481 --zk_root_path=/openmldb --cmd=recoverdata +``` + +## Server FAQ + +### 1. Why is there a warning log for 'Failed to write into Socket' in the log? +``` +http_rpc_protocol.cpp:911] Fail to write into Socket{id=xx fd=xx addr=xxx} (0x7a7ca00): Unknown error 1014 [1014] +``` +This is the log that the server will print. Usually, if the client-side uses connection pooling or short connection mode, the connection will be closed after RPC timeout. When the server writes back the response, it will report this error if it finds that the connection has already been closed. `Got EOF` refers to receiving an EOF before (when the other end has closed the connection normally). When using single connection mode on the client side, the server side generally does not report this. + +### 2. The initial setting of TTL for table data is not appropriate. How to adjust it? +This requires the use of nsclient to make the modification, and a regular client cannot achieve this. The startup method and commands for nsclient can be found in [NS Client](../maintain/cli.md#ns-client). + +In nsclient, you can use the `setttl` command to change the ttl for a table, similar to: + +``` +setttl table_name ttl_type ttl [ttl] [index_name] +``` +As can be seen, if the name of the index is configured at the end of the command, it is possible to only modify the ttl of a single index. +```{caution} +Changes made with `setttl` will not take effect immediately and will be influenced by the `gc_interval` configuration of the tablet server. (The configuration is independent for each tablet server and does not affect others.) + +For example, if the `gc_interval` for a tablet server is set to 1 hour, then the ttl configuration reload will take place at the end of the next gc (in the worst case, it will reload after 1 hour). During this gc, the data eviction will not follow the latest ttl. It will only use the updated ttl in the subsequent gc. + +Therefore, **after changing ttl, you need to wait for two cycles of the gc interval for it to take effect**. Please be patient. + +Of course, you can adjust the `gc_interval` for the tablet server, but this configuration cannot be changed dynamically and takes effect only after a restart. So, if there is significant memory pressure, consider scaling or migrating data shards to reduce memory pressure. It is not recommended to adjust `gc_interval` lightly. +``` + +### 3. A warning log appears: "Last Join right table is empty", what does this mean? +Generally speaking, this is a normal phenomenon and does not represent cluster anomalies. The right table of the join in the runner may be empty, which is likely due to data issues. 
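For illustration, here is a minimal hypothetical sketch (the table and column names are made up for this example) of a query pattern where this warning can appear: if no row of the right table matches a row's join key, the right side of the `LAST JOIN` is empty for that row.

```sql
-- Hypothetical tables: t1 (transactions) and t2 (merchant info).
-- If some t1.merchant_id values have no matching row in t2, the engine may
-- log "Last Join right table is empty" while the query still succeeds;
-- the t2 columns in the output are simply NULL for those rows.
SELECT t1.id, t1.merchant_id, t2.merchant_level
FROM t1
LAST JOIN t2 ORDER BY t2.update_time
ON t1.merchant_id = t2.merchant_id;
```

If the warning appears for most rows, the join keys rarely match, which is worth checking in the source data.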
+ diff --git a/docs/en/integration/deploy_integration/index.rst b/docs/en/integration/deploy_integration/index.rst index 15bff333619..edc057efc88 100644 --- a/docs/en/integration/deploy_integration/index.rst +++ b/docs/en/integration/deploy_integration/index.rst @@ -1,5 +1,5 @@ ============================= -dispatch +Dispatch ============================= .. toctree:: diff --git a/docs/en/integration/index.rst b/docs/en/integration/index.rst index 023bd3c9ab9..074131cf88a 100644 --- a/docs/en/integration/index.rst +++ b/docs/en/integration/index.rst @@ -1,5 +1,5 @@ ============================= -Upstream and downstream ecology +Upstream and Downstream Ecology ============================= .. toctree:: diff --git a/docs/en/integration/offline_data_sources/hive.md b/docs/en/integration/offline_data_sources/hive.md index 6784df026a2..5e41f5d66dd 100644 --- a/docs/en/integration/offline_data_sources/hive.md +++ b/docs/en/integration/offline_data_sources/hive.md @@ -102,7 +102,7 @@ Importing data from Hive sources is facilitated through the API [`LOAD DATA INFI - Both offline and online engines are capable of importing data from Hive sources. - The Hive data import feature supports soft connections. This approach minimizes the need for redundant data copies and ensures that OpenMLDB can access Hive's most up-to-date data at any given time. To activate the soft link mechanism for data import, utilize the `deep_copy=false` parameter. -- The `OPTIONS` parameter offers two valid settings: `deep_copy`, `mode` and `sql`. +- The `OPTIONS` parameter offers three valid settings: `deep_copy`, `mode` and `sql`. For example: @@ -122,7 +122,7 @@ LOAD DATA INFILE 'hive://db1.t1' INTO TABLE db1.t1 OPTIONS(deep_copy=true, sql=' Exporting data to Hive sources is facilitated through the API [`SELECT INTO`](../../openmldb_sql/dql/SELECT_INTO_STATEMENT.md), which employs a distinct URI format, `hive://[db].table`, to seamlessly transfer data to the Hive data warehouse. Here are some key considerations: -- If you omit specifying a database name, the default database name used will be `default_Db`. +- If you omit specifying Hive database name, the default database used in Hive will be `default`. - When a database name is explicitly provided, it's imperative that the database already exists. Currently, the system does not support the automatic creation of non-existent databases. - In the event that the designated Hive table name is absent, the system will automatically generate a table with the corresponding name within the Hive environment. - The `OPTIONS` parameter exclusively takes effect within the export mode of `mode`. Other parameters do not exert any influence. diff --git a/docs/en/integration/offline_data_sources/iceberg.md b/docs/en/integration/offline_data_sources/iceberg.md new file mode 100644 index 00000000000..5a46b93e932 --- /dev/null +++ b/docs/en/integration/offline_data_sources/iceberg.md @@ -0,0 +1,104 @@ +# Iceberg + +## Introduction + +[Apache Iceberg](https://iceberg.apache.org/) is an open table format for huge analytic datasets. Iceberg adds tables to compute engines including Spark, Trino, PrestoDB, Flink, Hive and Impala using a high-performance table format that works just like a SQL table. OpenMLDB supports the use of Iceberg as an offline storage engine for importing data and exporting feature computation data. 
+ +## Configuration + +### Installation + +For users employing [The OpenMLDB Spark Distribution Version](../../tutorial/openmldbspark_distribution.md), specifically v0.8.5 and newer iterations, the essential Iceberg 1.4.3 dependencies are already integrated. If you are working with an alternative Spark distribution or a different Iceberg version, you can download the corresponding Iceberg dependencies from the [Iceberg releases](https://iceberg.apache.org/releases/) and add them to the Spark classpath/jars. For example, if you are using OpenMLDB Spark, you should download the `x.x.x Spark 3.2_2.12 runtime Jar` (where x.x.x is the Iceberg version) and add it to `jars/` in the Spark home. + +### Configuration + +You should add the catalog configuration to the Spark configuration. This can be accomplished in two ways: + +- taskmanager.properties(.template): Include the Iceberg configs in the `spark.default.conf` configuration item, then restart the TaskManager. +- CLI: Put the configuration in an ini config file and pass it with `--spark_conf` when starting the CLI. Please refer to [Client Spark Configuration](../../reference/client_config/client_spark_config.md). + +Iceberg config details can be found in [Iceberg Configuration](https://iceberg.apache.org/docs/latest/spark-configuration/). + +For example, set a Hive catalog in `taskmanager.properties(.template)`: + +```properties +spark.default.conf=spark.sql.catalog.hive_prod=org.apache.iceberg.spark.SparkCatalog;spark.sql.catalog.hive_prod.type=hive;spark.sql.catalog.hive_prod.uri=thrift://metastore-host:port +``` + +If you need to create Iceberg tables, you also need to configure `spark.sql.catalog.hive_prod.warehouse`. + +Set a Hadoop catalog: + +```properties +spark.default.conf=spark.sql.catalog.hadoop_prod=org.apache.iceberg.hadoop.HadoopCatalog;spark.sql.catalog.hadoop_prod.type=hadoop;spark.sql.catalog.hadoop_prod.warehouse=hdfs://hadoop-namenode:port/warehouse +``` + +Set a REST catalog: + +```properties +spark.default.conf=spark.sql.catalog.rest_prod=org.apache.iceberg.spark.SparkCatalog;spark.sql.catalog.rest_prod.catalog-impl=org.apache.iceberg.rest.RESTCatalog;spark.sql.catalog.rest_prod.uri=http://iceberg-rest:8181/ +``` + +For the full Iceberg catalog configuration, see [Iceberg Catalog Configuration](https://iceberg.apache.org/docs/latest/spark-configuration/). + +### Debug Information + +When you import data from Iceberg, you can check the task log to confirm whether the task actually read the source data, for example, entries such as: +``` +INFO ReaderImpl: Reading ORC rows from +``` + +## Data Format + +For the Iceberg schema, see [Iceberg Schema](https://iceberg.apache.org/spec/#schema). Currently, only the following Iceberg data formats are supported: + +| OpenMLDB Data Format | Iceberg Data Format | +| -------------------- | ------------------- | +| BOOL | bool | +| INT | int | +| BIGINT | long | +| FLOAT | float | +| DOUBLE | double | +| DATE | date | +| TIMESTAMP | timestamp | +| STRING | string | + +## Import Iceberg Data to OpenMLDB + +Importing data from Iceberg sources is facilitated through the API [`LOAD DATA INFILE`](../../openmldb_sql/dml/LOAD_DATA_STATEMENT.md). This operation employs a specialized URI format, `iceberg://[catalog].[db].table`, to seamlessly import data from Iceberg. Here are some important considerations: + +- Both offline and online engines are capable of importing data from Iceberg sources. +- The Iceberg data import feature supports soft connections.
This approach minimizes the need for redundant data copies and ensures that OpenMLDB can access Iceberg's most up-to-date data at any given time. To activate the soft link mechanism for data import, utilize the `deep_copy=false` parameter. +- The `OPTIONS` parameter offers three valid settings: `deep_copy`, `mode` and `sql`. + +For example, load data from Iceberg configured as a Hive catalog: + +```sql +LOAD DATA INFILE 'iceberg://hive_prod.db1.t1' INTO TABLE t1 OPTIONS(deep_copy=false); +-- or +LOAD DATA INFILE 'hive_prod.db1.t1' INTO TABLE t1 OPTIONS(deep_copy=false, format='iceberg'); +``` + +The data loading process also supports using SQL queries to filter specific data from Iceberg tables. It's important to note that the SQL syntax must comply with SparkSQL standards. The table name used should be the registered name without the `iceberg://` prefix. + +For example: + +```sql +LOAD DATA INFILE 'iceberg://hive_prod.db1.t1' INTO TABLE db1.t1 OPTIONS(deep_copy=true, sql='SELECT * FROM hive_prod.db1.t1 where key=\"foo\"') +``` + +## Export OpenMLDB Data to Iceberg + +Exporting data to Iceberg sources is facilitated through the API [`SELECT INTO`](../../openmldb_sql/dql/SELECT_INTO_STATEMENT.md), which employs a distinct URI format, `iceberg://[catalog].[db].table`, to seamlessly transfer data to the Iceberg data warehouse. Here are some key considerations: + +- If you omit the Iceberg database name, the default database used in Iceberg will be `default`. +- When an Iceberg database name is explicitly provided, it's imperative that the database already exists. Currently, the system does not support the automatic creation of non-existent databases. +- In the event that the designated Iceberg table name is absent, the system will automatically generate a table with the corresponding name within the Iceberg environment. +- Among the `OPTIONS` parameters, only the export `mode` takes effect; other parameters have no effect. + +For example: + +```sql +SELECT col1, col2, col3 FROM t1 INTO OUTFILE 'iceberg://hive_prod.db1.t1'; +``` diff --git a/docs/en/integration/offline_data_sources/index.rst b/docs/en/integration/offline_data_sources/index.rst index 51f877f29bc..9947828577d 100644 --- a/docs/en/integration/offline_data_sources/index.rst +++ b/docs/en/integration/offline_data_sources/index.rst @@ -6,4 +6,5 @@ Offline Data Source :maxdepth: 1 hive - s3 \ No newline at end of file + s3 + iceberg \ No newline at end of file diff --git a/docs/en/integration/online_datasources/index.rst b/docs/en/integration/online_datasources/index.rst index 7b2232ef05b..a84d1d406b3 100644 --- a/docs/en/integration/online_datasources/index.rst +++ b/docs/en/integration/online_datasources/index.rst @@ -1,5 +1,5 @@ ============================= -online data source +Online Data Source ============================= .. toctree:: diff --git a/docs/en/openmldb_sql/dml/DELETE_STATEMENT.md b/docs/en/openmldb_sql/dml/DELETE_STATEMENT.md index 60914052bfd..08d8349aa36 100644 --- a/docs/en/openmldb_sql/dml/DELETE_STATEMENT.md +++ b/docs/en/openmldb_sql/dml/DELETE_STATEMENT.md @@ -11,7 +11,6 @@ TableName ::= ``` **Description** -- `DELETE` statement will delete data fulfilling specific requirements in online table, not all data from the index. Only index related to where condition will be deleted. For more examples please check [function_boundary](../../quickstart/function_boundary.md#delete). - The filter columns specified by `WHERE` must be an index column. if it is a key column, only `=` can be used.
## Examples diff --git a/docs/en/quickstart/cli.md b/docs/en/quickstart/cli.md index 878ccf8fe60..4e4c195cc5b 100644 --- a/docs/en/quickstart/cli.md +++ b/docs/en/quickstart/cli.md @@ -34,6 +34,10 @@ Below we will describe some commonly used configuration options. - zk_session_timeout: The expected ZooKeeper session timeout is not necessarily the actual session timeout. If the value is set too large, ZooKeeper Server's tickTime or maxSessionTimeout also needs to be adjusted. +- user: Specify the username for login. If not specified, it defaults to 'root'. + +- password: Specify the password for login. If not specified, you'll be prompted to enter the password in interactive mode. + ## Non-Interactive Usage The interface that appears after starting the CLI is called an interactive interface. You need to enter SQL statements and press Enter to execute operations. Here are some non-interactive usage methods for batch processing or debugging. diff --git a/docs/en/quickstart/function_boundary.md b/docs/en/quickstart/function_boundary.md index 9c2c0b7ae14..b9d55864874 100644 --- a/docs/en/quickstart/function_boundary.md +++ b/docs/en/quickstart/function_boundary.md @@ -70,63 +70,6 @@ It is recommended to use HDFS files as source data. This approach allows for suc - In local mode, TaskManager can successfully import source data only if the source data is placed on the same host as the TaskManager process. - When TaskManager is in Yarn mode (both client and cluster), a file path cannot be used as the source data address because it is not known on which host the container is running. -### DELETE - -In tables with multiple indexes in the online storage, a `DELETE` operation may not delete corresponding data in all indexes. Consequently, there may be situations where data has been deleted, but the deleted data can still be found. - -For example: - -```SQL -create database db; -use db; -create table t1(c1 int, c2 int,index(key=c1),index(key=c2)); -desc t1; -set @@execute_mode='online'; -insert into t1 values (1,1),(2,2); -delete from t1 where c2=2; -select * from t1; -select * from t1 where c2=2; -``` - -The results are as follows: - -```Plain - --- ------- ------ ------ --------- - Field Type Null Default - --- ------- ------ ------ --------- - 1 c1 Int YES - 2 c2 Int YES - --- ------- ------ ------ --------- - --- -------------------- ------ ---- ------ --------------- - name keys ts ttl ttl_type - --- -------------------- ------ ---- ------ --------------- - 1 INDEX_0_1668504212 c1 - 0min kAbsoluteTime - 2 INDEX_1_1668504212 c2 - 0min kAbsoluteTime - --- -------------------- ------ ---- ------ --------------- - -------------- - storage_mode - -------------- - Memory - -------------- - ---- ---- - c1 c2 - ---- ---- - 1 1 - 2 2 - ---- ---- - -2 rows in set - ---- ---- - c1 c2 - ---- ---- - -0 rows in set -``` - -Explanation: - -Table `t1` has multiple indexes (which may be automatically created during `DEPLOY`). If you run `delete from t1 where c2=2`, it only deletes data in the second index, while the data in the first index remains unaffected. Therefore, if you subsequently run `select * from t1` and it uses the first index, there are two pieces of data that haven't been deleted. `select * from t1 where c2=2` uses the second index, and the result is empty, with data being successfully deleted. - ## DQL Boundary The supported query modes (i.e. 
`SELECT` statements) vary depending on the execution mode: diff --git a/docs/en/quickstart/sdk/java_sdk.md b/docs/en/quickstart/sdk/java_sdk.md index 8934c55abf0..584488eff6c 100644 --- a/docs/en/quickstart/sdk/java_sdk.md +++ b/docs/en/quickstart/sdk/java_sdk.md @@ -53,6 +53,9 @@ Connection connection = DriverManager.getConnection("jdbc:openmldb:///?zk=localh // Set database in jdbcUrl Connection connection1 = DriverManager.getConnection("jdbc:openmldb:///test_db?zk=localhost:6181&zkPath=/openmldb"); + +// Set user and password in jdbcUrl +Connection connection = DriverManager.getConnection("jdbc:openmldb:///?zk=localhost:6181&zkPath=/openmldb&user=root&password=123456"); ``` The database specified in the Connection address must exist when creating the connection. @@ -113,6 +116,10 @@ option.setZkCluster("127.0.0.1:2181"); option.setZkPath("/openmldb"); option.setSessionTimeout(10000); option.setRequestTimeout(60000); +// If not specified, it defaults to 'root' +option.setUser("root"); +// If not specified, it defaults to being empty +option.setPassword("123456"); ``` Then, use SdkOption to create the Executor. diff --git a/docs/en/quickstart/sdk/python_sdk.md b/docs/en/quickstart/sdk/python_sdk.md index 6ae0e4705af..625cadc015e 100644 --- a/docs/en/quickstart/sdk/python_sdk.md +++ b/docs/en/quickstart/sdk/python_sdk.md @@ -21,6 +21,8 @@ Parameter `db_name` name must exist, and the database must be created before the ```python import openmldb.dbapi db = openmldb.dbapi.connect(zk="$zkcluster", zkPath="$zkpath") +# You can set the username and password as follows. If no username is set, it defaults to 'root', and the password defaults to being empty +# db = openmldb.dbapi.connect(zk="$zkcluster", zkPath="$zkpath", user="$user", password="$password") cursor = db.cursor() ``` @@ -124,6 +126,8 @@ Parameter `db_name` must exist, and the database must be created before the conn ```python import sqlalchemy as db engine = db.create_engine('openmldb:///?zk=127.0.0.1:2181&zkPath=/openmldb') +# You can set the username and password as follows. +# create_engine('openmldb:///db_name?zk=zkcluster&zkPath=zkpath&user=root&password=123456') connection = engine.connect() ``` diff --git a/docs/en/reference/sql/ddl/ALTER_USER_STATEMENT.md b/docs/en/reference/sql/ddl/ALTER_USER_STATEMENT.md new file mode 100644 index 00000000000..c1b764cc670 --- /dev/null +++ b/docs/en/reference/sql/ddl/ALTER_USER_STATEMENT.md @@ -0,0 +1,45 @@ +# ALTER USER + +The `ALTER USER` statement is used to modify a user's password. + +## Syntax +```sql +AlterUserstmt ::= + 'ALTER' 'USER' [IF EXISTS] UserName SET OptOptionsList + +UserName ::= Identifier + +OptOptionsList ::= + "OPTIONS" OptionList + +OptionList ::= + OptionsListPrefix ")" + +OptionsListPrefix ::= + "(" OptionEntry + | OptionsListPrefix "," OptionEntry + +OptionEntry ::= + Identifier "=" Identifier +``` + +## **Examples** +```sql +ALTER USER user1; +-- SUCCEED +ALTER USER IF EXISTS user2 SET OPTIONS(password='123456'); +-- SUCCEED +ALTER USER user3 SET OPTIONS (password='123456'); +-- SUCCEED +``` + +```{note} +1. If the password is not specified in the OPTIONS, the password will not be changed +2. 
You can only specify the password in the OPTIONS +``` + +## Related SQL + +[CREATE USER](./CREATE_USER_STATEMENT.md) +[DROP USER](./DROP_USER_STATEMENT.md) +[SHOW CURRENT_USER](./SHOW_CURRENT_USER_STATEMENT.md) \ No newline at end of file diff --git a/docs/en/reference/sql/ddl/CREATE_USER_STATEMENT.md b/docs/en/reference/sql/ddl/CREATE_USER_STATEMENT.md new file mode 100644 index 00000000000..fa169f8fd55 --- /dev/null +++ b/docs/en/reference/sql/ddl/CREATE_USER_STATEMENT.md @@ -0,0 +1,45 @@ +# CREATE USER + +The `CREATE USER` statement is used to create a user + +## Syntax +```sql +CreateUserstmt ::= + 'CREATE' 'USER' [IF NOT EXISTS] UserName OptOptionsList + +UserName ::= Identifier + +OptOptionsList ::= + "OPTIONS" OptionList + +OptionList ::= + OptionsListPrefix ")" + +OptionsListPrefix ::= + "(" OptionEntry + | OptionsListPrefix "," OptionEntry + +OptionEntry ::= + Identifier "=" Identifier +``` + +## **Examples** +```sql +CREATE USER user1; +-- SUCCEED +CREATE USER IF NOT EXISTS user2; +-- SUCCEED +CREATE USER user3 OPTIONS (password='123456'); +-- SUCCEED +``` + +```{note} +1. Only the password can be specified in the OPTIONS +2. The password will be empty if not specified explicitly +``` + +## Related SQL + +[DROP USER](./DROP_USER_STATEMENT.md) +[ALTER USER](./ALTER_USER_STATEMENT.md) +[SHOW CURRENT_USER](./SHOW_CURRENT_USER_STATEMENT.md) \ No newline at end of file diff --git a/docs/en/reference/sql/ddl/DROP_USER_STATEMENT.md b/docs/en/reference/sql/ddl/DROP_USER_STATEMENT.md new file mode 100644 index 00000000000..2d4d0ce2db7 --- /dev/null +++ b/docs/en/reference/sql/ddl/DROP_USER_STATEMENT.md @@ -0,0 +1,29 @@ +# DROP USER + +The `DROP USER` statement is used to drop a user. + +## Syntax +```sql +DropUserstmt ::= + 'DROP' 'USER' [IF EXISTS] UserName + +UserName ::= Identifier +``` + +## **Examples** +```sql +DROP USER user1; +-- SUCCEED +DROP USER IF EXISTS user2; +-- SUCCEED +``` + +```{note} +1. The user `root` cannot be deleted +``` + +## Related SQL + +[CREATE USER](./CREATE_USER_STATEMENT.md) +[ALTER USER](./ALTER_USER_STATEMENT.md) +[SHOW CURRENT_USER](./SHOW_CURRENT_USER_STATEMENT.md) \ No newline at end of file diff --git a/docs/en/reference/sql/ddl/SHOW_CURRENT_USER_STATEMENT.md b/docs/en/reference/sql/ddl/SHOW_CURRENT_USER_STATEMENT.md new file mode 100644 index 00000000000..20de9171fa8 --- /dev/null +++ b/docs/en/reference/sql/ddl/SHOW_CURRENT_USER_STATEMENT.md @@ -0,0 +1,17 @@ +# SHOW CURRENT_USER + +The `SHOW CURRENT_USER` statement is used to display the current user. + +## **Examples** +```sql +SHOW CURRENT_USER; + ------ + User + ------ + root + ------ +``` + +[CREATE USER](./CREATE_USER_STATEMENT.md) +[ALTER USER](./ALTER_USER_STATEMENT.md) +[DROP USER](./DROP_USER_STATEMENT.md) \ No newline at end of file diff --git a/docs/en/tutorial/index.rst b/docs/en/tutorial/index.rst index fbbd84eda26..4815ca95cb2 100644 --- a/docs/en/tutorial/index.rst +++ b/docs/en/tutorial/index.rst @@ -5,12 +5,14 @@ Tutorials .. toctree:: :maxdepth: 1 - standalone_vs_cluster - modes + data_import_guide tutorial_sql_1 tutorial_sql_2 - data_import openmldbspark_distribution + data_import + data_export autofe - common_architecture + standalone_vs_cluster + standalone_use + app_arch online_offline_sync diff --git a/docs/en/tutorial/tutorial_sql_1.md b/docs/en/tutorial/tutorial_sql_1.md index b6bcd5530e8..c94df66c086 100644 --- a/docs/en/tutorial/tutorial_sql_1.md +++ b/docs/en/tutorial/tutorial_sql_1.md @@ -1,7 +1,7 @@ # SQL for Feature Extraction (Part 1) -## 1. 
The Feature Engineering of Machine Learning +## 1. Feature Engineering for Machine Learning A real-world machine learning application generally includes two main processes, namely **Feature Engineering** and **Machine Learning Model** (hereinafter referred to as **Model**). We must know a lot about the model, from the classic logistic regression and decision tree models to the deep learning models, we all focus on how to develop high-quality models. We may pay less attention to feature engineering. However, as the saying goes, data and features determine the upper limit of machine learning, while models and algorithms only approach this limit. It can be seen that we have long agreed on the importance of Feature Engineering. @@ -59,7 +59,7 @@ For example, the following user transaction table (hereinafter referred as data | trans_type | STRING | Transaction Type | | province | STRING | Province | | city | STRING | City | -| label | BOOL | Sample label, true\|false | +| label | BOOL | Sample label, `true` or `false` | In addition to the primary table, there may also be tables storing relevant auxiliary information in the database, which can be combined with the primary table through the JOIN operation. These tables are called **Secondary Tables** (note that there may be multiple secondary tables). For example, we can have a secondary table storing the merchants' history flow. In the process of feature engineering, more valuable information can be obtained by combining the primary and secondary tables. The feature engineering over multiple tables will be introduced in detail in the [next part](tutorial_sql_2.md) of this series. @@ -143,6 +143,7 @@ Important parameters include: - The lower bound time must be `>=` the upper bound time. - The lower bound row must follow the upper bound row. +For more features, please refer to the [documentation](../openmldb_sql/dql/WHERE_CLAUSE.md). #### Example @@ -150,9 +151,9 @@ For the transaction table T1 shown above, we define two `ROWS_RANGE` windows and ![img](images/table_t1.png) -Note that the following window definitions are not completed SQL. We will add aggregate functions later to complete runnable SQL. +Note that the following window definitions are not complete SQL. We will add aggregate functions to make them runnable SQL. (See [3.3.2](#332-step-2constructfeaturesbasedontimewindow)) -- w1d: the window within the most recent day +**w1d: the window within the most recent day** The window of the user's most recent day containing the rows from the current to the most recent day ```sql window w1d as (PARTITION BY uid ORDER BY trans_time ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW) ``` The `w1d` window shown in the above figure is for the partition `id=9`, and the `w1d` window contains three rows (`id=6`, `id=8`, `id=9`). These three rows fall in the time window [2022-02-07 12:00:00, 2022-02-08 12:00:00] . -- w1d_10d: the window from 1 day ago to the last 10 days +**w1d_10d: the window from 1 day ago to the last 10 days** ```sql window w1d_10d as (PARTITION BY uid ORDER BY trans_time ROWS_RANGE BETWEEN 10d PRECEDING AND 1d PRECEDING) ``` The window `w1d_10d` for the partition `id=9` contains three rows, which are `id=1`, `id=3` and `id=4`.
These three rows fall in the time window of [2022-01-29 12:00:00, 2022-02-07 12:00:00]。 -- w0_1: the window contains the last 0 ~ 1 rows +**w0_1: the window contains the last 0 ~ 1 rows** The window contains the last 0 ~ 1 rows, including the previous line and the current line. ```sql window w0_1 as (PARTITION BY uid ORDER BY trans_time ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) @@ -175,7 +176,7 @@ window w0_1 as (PARTITION BY uid ORDER BY trans_time ROWS BETWEEN 1 PRECEDING AN The window `w0_1` for the partition `id=10` contains 2 rows, which are `id=7` and `id=10`. -- w2_10: the window contains the last 2 ~ 10 rows +**w2_10: the window contains the last 2 ~ 10 rows** ```sql window w2_10 as (PARTITION BY uid ORDER BY trans_time ROWS BETWEEN 10 PRECEDING AND 2 PRECEDING) @@ -304,7 +305,7 @@ window w30d as (PARTITION BY uid ORDER BY trans_time ROWS_RANGE BETWEEN 30d PREC We make frequency statistics for a given column as we may need to know the type of the highest frequency, the proportion of the type with the largest number, etc., in each category. -`top1_ratio`: Find out the type with the largest number and compute the proportion of its number in the window. +**`top1_ratio`**: Find out the type with the largest number and compute the proportion of its number in the window. The following SQL uses `top1_ratio` to find out the city with the most transactions in the last 30 days and compute the proportion of the number of transactions of the city to the total number of transactions in t1. ```sql @@ -314,7 +315,7 @@ FROM t1 window w30d as (PARTITION BY uid ORDER BY trans_time ROWS_RANGE BETWEEN 30d PRECEDING AND CURRENT ROW); ``` -`topn_frequency(col, top_n)`: Find the `top_n` categories with the highest frequency in the window +**`topn_frequency(col, top_n)`**: Find the `top_n` categories with the highest frequency in the window The following SQL uses `topn_frequency` to find out the top 2 cities with the highest number of transactions in the last 30 days in t1. ```sql diff --git a/docs/en/tutorial/tutorial_sql_2.md b/docs/en/tutorial/tutorial_sql_2.md index bb69147c065..cc7ab8261ad 100644 --- a/docs/en/tutorial/tutorial_sql_2.md +++ b/docs/en/tutorial/tutorial_sql_2.md @@ -63,7 +63,7 @@ As shown below, left table `LAST JOIN` right table with `ORDER BY` and right tab ## 3. Multi-Row Aggregation over Multiple Tables -For aggregation over multiple tables, OpenMLDB extends the standard WINDOW syntax and adds [WINDOW UNION](../reference/sql/dql/WINDOW_CLAUSE.md#window-union) syntax. +For aggregation over multiple tables, OpenMLDB extends the standard WINDOW syntax and adds [WINDOW UNION](../openmldb_sql/dql/WINDOW_CLAUSE.md#1-window--union) syntax. WINDOW UNION supports combining multiple pieces of data from the secondary table to form a window on secondary table. Based on the time window, it is convenient to construct the multi-row aggregation feature of the secondary table. Similarly, two steps need to be completed to construct the multi-row aggregation feature of the secondary table: @@ -122,10 +122,10 @@ Among them, necessary elements include: - Lower bound time must be > = Upper bound time - The row number of lower bound must be < = The row number of upper bound - `INSTANCE_NOT_IN_WINDOW`: It indicates that except for the current row, other data in the main table will not enter the window. -- For more syntax and features, please refer to [OpenMLDB WINDOW UNION Reference Manual](../reference/sql/dql/WINDOW_CLAUSE.md). 
+- For more syntax and features, please refer to [OpenMLDB WINDOW UNION Reference Manual](../openmldb_sql/sql/dql/WINDOW_CLAUSE.md). ``` -### Example +#### Example Let's see the usage of WINDOW UNION through specific examples. @@ -166,7 +166,7 @@ PARTITION BY mid ORDER BY purchase_time ROWS_RANGE BETWEEN 10d PRECEDING AND 1 PRECEDING INSTANCE_NOT_IN_WINDOW) ``` -## 3.2 Step 2: Build Multi-Row Aggregation Feature of Sub Table +### 3.2 Step 2: Build Multi-Row Aggregation Feature of Sub Table Apply the multi-row aggregation function on the created window to construct aggregation features on multi-rows of secondary table, so that the number of rows finally generated is the same as that of the main table. For example, we can construct features from the secondary table like: the total retail sales of merchants in the last 10 days `w10d_merchant_purchase_amt_sum` and the total consumption times of the merchant in the last 10 days `w10d_merchant_purchase_count`. diff --git a/docs/en/use_case/airflow_provider_demo.md b/docs/en/use_case/airflow_provider_demo.md deleted file mode 100644 index 9019ba2c5a6..00000000000 --- a/docs/en/use_case/airflow_provider_demo.md +++ /dev/null @@ -1,123 +0,0 @@ -# Airflow OpenMLDB Provider -We provide the [Airflow OpenMLDB Provider](https://github.com/4paradigm/OpenMLDB/tree/main/extensions/airflow-provider-openmldb) to use the OpenMLDB in Airflow DAG more easily. -This manual will use the Airflow to manage the training and deployment tasks in the [TalkingData Demo](talkingdata_demo). - -## TalkingData DAG - -We will use the DAG created by [example_openmldb_complex.py](https://github.com/4paradigm/OpenMLDB/blob/main/extensions/airflow-provider-openmldb/openmldb_provider/example_dags/example_openmldb_complex.py) in the Airflow. -You can import the DAG into the Airflow and run it directly. -![airflow dag](images/airflow_dag.png) - -The workflow of the DAG is shown above. The tables will be created at first, then the offline data will be imported and processed for feature extraction. After training, if the AUC of the model is greater than 99.0, then the SQL script and the model can be deployed. Otherwise, the workflow will report failure. - -## Demo - -The DAG mentioned above will be used to complete the feature extraction and deployment work in the [TalkingData Demo](talkingdata_demo), and the predict_server in this demo is responsible for the real-time prediction after deployment. - -### 0 Preparations - -#### 0.1 Download DAG -Both the DAG and the training script can be gained by downloading [airflow_demo_files](https://openmldb.ai/download/airflow_demo/airflow_demo_files.tar.gz). - -``` -wget https://openmldb.ai/download/airflow_demo/airflow_demo_files.tar.gz -tar zxf airflow_demo_files.tar.gz -ls airflow_demo_files -``` -For the newest version, please visit [GitHub example_dags](https://github.com/4paradigm/OpenMLDB/tree/main/extensions/airflow-provider-openmldb/openmldb_provider/example_dags). - - -#### 0.2 Start the Docker Image - -- It is recommended to install and start the OpenMLDB image and the Airflow in Docker. -- The port of the container needs to be exposed for the Airflow Web login. -- Please project the previously downloaded files to the path `/work/airflow/dags`, where Airflow will access for the DAG. - -``` -docker run -p 8080:8080 -v `pwd`/airflow_demo_files:/work/airflow/dags -it 4pdosc/openmldb:0.8.4 bash -``` - -#### 0.3 Download and Install the Airflow and the Airflow OpenMLDB Provider -Run the following command in Docker. 
-``` -pip3 install airflow-provider-openmldb -``` -Since the Airflow OpenMLDB Provider relies on the Airflow, they will be downloaded together. - -#### 0.4 Prepare the Dataset -Since the data import path of the DAG is `/tmp/train_sample.csv`, we have to copy the data file to `/tmp` directory. -``` -cp /work/talkingdata/train_sample.csv /tmp/ -``` - -### 1 Start the OpenMLDB and the Airflow -The following commands will start the OpenMLDB cluster. The `predict_server` supports deployment and test, and the standalone Airflow. - -``` -/work/init.sh -python3 /work/talkingdata/predict_server.py --no-init > predict.log 2>&1 & -export AIRFLOW_HOME=/work/airflow -cd /work/airflow -airflow standalone -``` - -The username and the password for the Airflow standalone are shown in the picture below. - -![airflow login](images/airflow_login.png) - -Please visit `http://localhost:8080`, enter the username and the password as shown. - -```{caution} -`Airflow standalone` is a foreground process, the exit will lead to the whole termiantion of the process. -You can quit the Airflow after the DAG is finished then go for [Step 3](#3-test) or just put the Airflow process to the background. -``` - -### 2 Run the DAG -Open the DAG example_openmldb_complex in the Airflow Web and click the `Code` to check the detail of the DAG. -![dag home](images/dag_home.png) - -You can see the `openmldb_conn_id` that is used in `Code`. The DAG doesn't use the address of OpenMLDB but use the connection. We need to create a new connection and name it the same. -![dag code](images/dag_code.png) - -#### 2.1 Create the Connection -Click the 'connection' in the 'Admin'. -![connection](images/connection.png) - -Add a connection. -![add connection](images/add_connection.png) - -Please use the address of the OpenMLDB Api Server rather than the address of zookeeper as the Airflow OpenMLDB Provider is connected to the OpenMLDB Api Server. - -![connection settings](images/connection_settings.png) - -The created connection is shown as the picture below. -![display](images/connection_display.png) - -#### 2.2 Run the DAG -Run the DAG to complete a turn of model training, SQL and model deployment. -A successful run should look like the following figure. -![dag run](images/dag_run.png) - -### 3 Test - -If you run the Airflow foreground, you can quit the Airflow as the subsequent procedures do not depend on it. -#### 3.1 Import the Online Data -Although the DAG has deployed the SQL and the model, there is no data in the online database. -You should run the following command to import the online data. -``` -curl -X POST http://127.0.0.1:9080/dbs/example_db -d'{"mode":"online", "sql":"load data infile \"file:///tmp/train_sample.csv\" into table example_table options(mode=\"append\");"}' -``` -This is an asynchronous operation, but it won't take too long because of the small data size. -If you want to check the execution state of the command, please use `SHOW JOBS`. -``` -curl -X POST http://127.0.0.1:9080/dbs/example_db -d'{"mode":"online", "sql":"show jobs"}' -``` - -#### 3.2 Prediction -Run the following prediction script which will use the latest deployed SQL and model. -``` -python3 /work/talkingdata/predict.py -``` -The result is shown below. 
-![result](images/airflow_test_result.png) - diff --git a/docs/en/use_case/dolphinscheduler_task_demo.md b/docs/en/use_case/dolphinscheduler_task_demo.md deleted file mode 100644 index 5a4a8e6bfb8..00000000000 --- a/docs/en/use_case/dolphinscheduler_task_demo.md +++ /dev/null @@ -1,215 +0,0 @@ -# Building End-to-End MLOps Workflows (OpenMLDB + DolphinScheduler) - -## Background -In the closed loop of machine learning applications from development to deployment, data processing, feature engineering, and model training often cost a lot of time and manpower. To facilitate AI applications development and deployment, we have developed the DolphinScheduler OpenMLDB Task, which integrates feature engineering into the workflow of DolphinScheduler to build an end-to-end MLOps workflow. This article will briefly introduce and demonstrate the operation process of the DolphinScheduler OpenMLDB Task. - -```{seealso} -See [DolphinScheduler OpenMLDB Task Official Documentation](https://dolphinscheduler.apache.org/en-us/docs/3.1.5/guide/task/openmldb) for full details. -``` - -## Scenarios and Functions -### Why We Need the DolphinScheduler OpenMLDB Task - -![image-20220610170510779](../../zh/use_case/images/ecosystem.png) - -As an open-source machine learning database that provides full-stack solutions for data and feature engineering, the key point for OpenMLDB is to improve ease-of-use and integrate the open-source ecosystem. As shown in the above figure, accessing the data source can make it easier for the data in DataOps to feed into OpenMLDB, and the features provided by OpenMLDB also need to smoothly enter ModelOps for training. - -In this article, we focus on the integration with the workflow scheduler platform DolphinScheduler. The DolphinScheduler OpenMLDB Task can operate OpenMLDB more easily. At the same time, the OpenMLDB task is also managed by workflow and is fully automated. - -### What Can the DolphinScheduler OpenMLDB Task Do - -By writing the OpenMLDB task, we can meet the requirements of OpenMLDB for offline import, feature extraction, SQL deployment, real-time data import, etc. We can build an end-to-end machine learning pipeline using OpenMLDB based on DolphinScheduler. - -![image-20220610174647990](../../zh/use_case/images/task_func.png) - -For example, the typical workflow of machine learning based on OpenMLDB is shown in the figure above, steps 1-4 in the process correspond to offline data import, feature extraction, SQL deployment, and real-time data import, which can be written through the DolphinScheduler OpenMLDB Task. - -In addition to the feature engineering done by OpenMLDB, the prediction also requires model inference. So next, based on the TalkingData advertising fraud detection scenario from the Kaggle competition, we will demonstrate how to use the DolphinScheduler OpenMLDB Task to build an end-to-end machine learning pipeline. For details of the TalkingData competition, see [talkingdata-adtracking-fraud-detection](https://www.kaggle.com/competitions/talkingdata-adtracking-fraud-detection/discussion). - -## Demo -### Configuration - -** Use OpenMLDB docker image** - -The demo can run on MacOS or Linux, the OpenMLDB docker image is recommended. We'll start OpenMLDB and DolphinScheduler in the same container, expose the DolphinScheduler web port: -``` -docker run -it -p 12345:12345 4pdosc/openmldb:0.8.4 bash -``` - -```{attention} -The DolphinScheduler requires a user of the operating system with `sudo` permission. 
Therefore, it is recommended to download and start the DolphinScheduler in the OpenMLDB container. Otherwise, please prepare the operating system user with sudo permission. -``` - -The docker image doesn't have sudo, but DolphinScheduler needs it in runtime. So install it: -``` -apt update && apt install sudo -``` - -And DolphinScheduler task running uses sh, but the docker image default sh is `dash`. Change it to `bash`: -``` -dpkg-reconfigure dash -``` -And enter `no`. - -**Source Data** - -The workflow will load data from `/tmp/train_sample.csv`,so prepare it: -``` -curl -SLo /tmp/train_sample.csv https://openmldb.ai/download/dolphinschduler-task/train_sample.csv -``` - -**Start OpenMLDB Cluster and Predict Server** - -In the container, you can directly run the following command to start the OpenMLDB cluster. -``` -./init.sh -``` - -We will complete a workflow of importing data, offline training, and deploying the SQL and model online after successful training. For the online part of the model, you can use the simple predict server in `/work/talkingdata`. Run it in the background: -``` -cd /work -curl -SLo predict_server.py https://openmldb.ai/download/dolphinschduler-task/predict_server.py -python3 predict_server.py --no-init > predict.log 2>&1 & -``` -```{tip} -If online predict test got errors, please check the log`/work/predict.log`. -``` - -**Start DolphinScheduler** - -You can download the DolphinScheduler package in [official](https://dolphinscheduler.apache.org/zh-cn/download/3.1.5), or the mirror site prepared by us, in[dolphinscheduler-bin download link](http://openmldb.ai/download/dolphinschduler-task/apache-dolphinscheduler-dev-3.1.5-bin.tar.gz). - -Start the DolphinScheduler standalone version. The steps are as follows. For more information, please refer to [Official Documentation](https://dolphinscheduler.apache.org/en-us/docs/3.1.5/guide/installation/standalone)。 -``` -curl -SLO https://dlcdn.apache.org/dolphinscheduler/3.1.5/apache-dolphinscheduler-3.1.5-bin.tar.gz -# mirror: curl -SLO http://openmldb.ai/download/dolphinschduler-task/apache-dolphinscheduler-dev-3.1.5-bin.tar.gz -tar -xvzf apache-dolpSchedulerler-*-bin.tar.gz -cd apache-dolpSchedulerler-*-bin -sed -i s#/opt/soft/python#/usr/bin/python3#g bin/env/dolphinscheduler_env.sh -sh ./bin/dolpSchedulerler-daemon.sh start standalone-server -``` - -```{hint} -The OpenMLDB Task in old version (< 3.1.2) has problems,can't work, please use the newer package(>=3.1.3). If you want the DolphinScheduler in old version, ask us for the fix version. - -In higher version of DolphinScheduler, `bin/env/dolphinscheduler_env.sh` may be changed, we need to append `PYTHON_HOME` to it, run `echo "export PYTHON_HOME=/usr/bin/python3" >> bin/env/dolphinscheduler_env.sh`. - -We have set the Python environment by modify `PYTHON_HOME` in `bin/env/dolphinscheduler_env.sh`, as shown in the previous code(Python Task needs to explicitly set the python environment, cuz we use Python3). If you have started the DolphinScheduler already, you can also set the environment on the web page after startup. The setting method is as follows. **Note that in this case, it is necessary to confirm that all tasks in the workflow use this environment** - -Note that before the DolphinScheduler standalone runs, the configured temporary environment variable `PYTHON_HOME` does not affect the environment in the work server. 
-``` - -Now you can login to DolphinScheduler at http://localhost:12345/dolphinscheduler/ui (If you access it by another machine, use the IP address). The default user name and password are: admin/dolphinscheduler123。 - -```{note} -The worker server of DolphinScheduler requires the OpenMLDB Python SDK. The worker of DolphinScheduler standalone is the local machine, so you only need to install the OpenMLDB Python SDK on the local machine. The Python SDK is ready in our OpenMLDB image. If you are not running the docker image, install the SDK by `pip3 install openmldb`. -``` - -**Download workflow json** - -Workflows can be created manually. In this example, we directly provide JSON workflow files, [Click to Download](http://openmldb.ai/download/dolphinschduler-task/workflow_openmldb_demo.json), and you can directly import it later into the DolphinScheduler environment and make simple modifications to complete the whole workflow. - -Note that, you should download the workflow file in the machine which you open the browser. We'll upload the file on web. - -### Demo Steps - -#### Step 1. Initialize Configuration - -You need to first create a tenant in the DolphinScheduler Web, and then enter the tenant management interface, fill in the operating system user with sudo permission, and use the default for the queue. You can use **root** if you run it in the docker container. - -![create tenant](../../zh/use_case/images/ds_create_tenant.png) - -Then you need to bind the tenant to the user. For simplicity, we directly bind to the admin user. Enter the user management page and click edit admin user. - -![bind tenant](../../zh/use_case/images/ds_bind_tenant.png) - -After binding, the user status is similar to the following figure. - -![bind status](../../zh/use_case/images/ds_bind_status.png) - -#### Step 2. Create Workflow -In the DolphinScheduler, you need to create a project first, and then create a workflow in the project. Therefore, first create a test project, as shown in the following figure. Click create a project and enter the project. - -![create project](../../zh/use_case/images/ds_create_project.png) - -![project](../../zh/use_case/images/ds_project.png) - -After entering the project, you can import the [downloaded workflow file](https://github.com/4paradigm/OpenMLDB/releases/download/v0.5.1/workflow_openmldb_demo.json). As shown in the following figure, please click Import workflow in the workflow definition interface. - -![import workflow](../../zh/use_case/images/ds_import_workflow.png) - -After the import, the workflow will appear in the workflow list, similar to the following figure. - -![workflow list](../../zh/use_case/images/ds_workflow_list.png) - -Then you click the workflow name to view the workflow details, as shown in the following figure. - -![workflow detail](../../zh/use_case/images/ds_workflow_detail.png) - -**Note**: This needs to be modified because the task ID will change after importing the workflow. In particular, the upstream and downstream id in the switch task do not exist and need to be manually changed. - -![switch](../../zh/use_case/images/ds_switch.png) - -As shown in the above figure, there is a non-existent ID in the settings of the switch task. Please change the successful and failed "branch flow" and "pre-check condition" to the task of the current workflow. - -The correct result is shown in the following figure: - -![right](../../zh/use_case/images/ds_switch_right.png) - -After modification, we save the workflow. 
The tenant in the imported workflow will be treated as the default tenant, and the workflow can still run that way. If you want to specify your own tenant, please select one when saving the workflow, as shown in the following figure.
![set tenant](../../zh/use_case/images/ds_set_tenant.png)

#### Step 3. Online Operation

After saving the workflow, you need to bring it online before running it. The run button will not light up until the workflow is online, as shown in the following figure.

![run](../../zh/use_case/images/ds_run.png)

Please click run and wait for the workflow to complete. You can view the workflow running details in the Workflow Instance interface, as shown in the following figure.

![run status](../../zh/use_case/images/ds_run_status.png)

To demonstrate a successful launch, the validation task does not perform an actual validation; it directly reports success and flows into the deploy branch. After the deploy branch runs, the deploy SQL and subsequent tasks succeed, and the predict server receives the latest model.

```{note}
If an instance in Workflow Instance shows `Failed`, click the instance name to jump to the detail page, double-click the failed task, and click `View log` in the top right-hand corner to check the log for detailed error messages.

The `load offline data`, `feature extraction` and `load online` tasks may succeed in DolphinScheduler while the corresponding job fails in OpenMLDB. In that case the `train` task may report the error 'No object to concatenate' (a traceback from `pd.concat`), which means it found no feature source.

If something goes wrong, please check the real state of each job in OpenMLDB. You can run `echo "show jobs;" | /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client`. If a job state is `FAILED`, find the job log. See [job log path](../../zh/quickstart/beginner_must_read.md#离线) to locate it.
```

#### Step 4. Online Predict Test
The predict server also provides an online prediction service, requested through `curl /predict`. We simply construct a real-time request and send it to the predict server (a Python equivalent is sketched at the end of this article).
```
curl -X POST 127.0.0.1:8881/predict -d '{"ip": 114904,
    "app": 11,
    "device": 1,
    "os": 15,
    "channel": 319,
    "click_time": 1509960088000,
    "is_attributed": 0}'
```
The returned results are as follows:

![predict](../../zh/use_case/images/ds_predict.png)

#### Supplement

If you rerun the workflow, the `deploy sql` task may fail because the deployment `demo` already exists. Please delete the deployment in the container before rerunning the workflow:
```
/work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client --database=demo_db --interactive=false --cmd="drop deployment demo;"
```

You can check whether the deployment has been deleted:
```
/work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client --database=demo_db --interactive=false --cmd="show deployment demo;"
```

Restart the DolphinScheduler server (its metadata will be cleaned, so you need to redo the configuration and create the workflow again):
```
./bin/dolphinscheduler-daemon.sh stop standalone-server
./bin/dolphinscheduler-daemon.sh start standalone-server
```

If you want to persist the metadata, check [Pseudo-Cluster Deployment](https://dolphinscheduler.apache.org/en-us/docs/3.1.5/guide/installation/pseudo-cluster) to use a database.
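As a small aside for readers who prefer scripting the request from "Step 4. Online Predict Test" instead of using curl, a minimal Python sketch is shown below. It assumes the demo predict server is still listening on 127.0.0.1:8881 and that the `requests` package is installed; it is an illustration, not one of the demo files.

```python
# Rough Python equivalent of the curl request in "Step 4. Online Predict Test".
# Assumes the demo predict server is listening on 127.0.0.1:8881 and that
# `pip3 install requests` has been run.
import json
import requests

row = {
    "ip": 114904,
    "app": 11,
    "device": 1,
    "os": 15,
    "channel": 319,
    "click_time": 1509960088000,
    "is_attributed": 0,
}

resp = requests.post("http://127.0.0.1:8881/predict", data=json.dumps(row), timeout=10)
resp.raise_for_status()
print(resp.text)  # prints whatever the predict server returns for this row
```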
diff --git a/docs/en/use_case/kafka_connector_demo.md b/docs/en/use_case/kafka_connector_demo.md deleted file mode 100644 index 70288b0001d..00000000000 --- a/docs/en/use_case/kafka_connector_demo.md +++ /dev/null @@ -1,224 +0,0 @@ -# Importing Real-Time Data Streams from Kafka - -## Introduction - -Apache Kafka is an event streaming platform. It can be used as the online data source of OpenMLDB, import the real-time data from data stream into OpenMLDB online. For more information about Kafka, please refer to the official website https://kafka.apache.org/. We have developed a Kafka connector to bridge the OpenMLDB, which can connect Kafka and OpenMLDB without obstacles. In this document, you will learn the concept and usage of this connector. - -Please note that in order to make the demonstration easier, this article will use the Kafka Connect standalone mode to start the connector. The connector can be started in the distributed mode. - -:::{seealso} - -For OpenMLDB Kafka Connector implementation, please refer to [extensions/kafka-connect-jdbc](https://github.com/4paradigm/OpenMLDB/tree/main/extensions/kafka-connect-jdbc). -::: - -## Overview - -### Download and Preparation - -- Download Kafka: please click [kafka downloads](https://kafka.apache.org/downloads) to download `kafka_2.13-3.1.0.tgz`. -- Download the connector package and dependencies: please click on [kafka-connect-jdbc.tgz](https://github.com/4paradigm/OpenMLDB/releases/download/v0.5.0/kafka-connect-jdbc.tgz). -- Download the configuration and script files (for the demonstration purpose used in this article): please click on [kafka_demo_files.tgz](http://openmldb.ai/download/kafka-connector/kafka_demo_files.tgz). - -This article will start the OpenMLDB in docker container, so there is no need to download the OpenMLDB separately. Moreover, Kafka and connector can be started in the same container. We recommend that you save the three downloaded packages to the same directory. Let's assume that the packages are in the `/work/kafka` directory. - -``` -docker run -it -v `pwd`:/work/kafka --name openmldb 4pdosc/openmldb:0.8.4 bash -``` - -### Steps - -The brief process of using the connector is shown in the figure below. We will describe each step in detail next. - -In general, the use process can be summarized into four steps: - -1. Start OpenMLDB and create the database -2. Start Kafka and create topic -3. Start OpenMLDB Kafka Connector -4. Proceed for test or normal use - -![demo steps](../../zh/use_case/images/kafka_connector_steps.png) - - -## Step 1: Start the OpenMLDB and Create a Database - -### Start the OpenMLDB Cluster - -In the OpenMLDB container, start the cluster: - -``` -/work/init.sh -``` - -:::{caution} - -At present, only the OpenMLDB cluster version can be used as the receiver of sink, and the data will only be sink to the online storage of the cluster. -::: - -### Create Database - -We can quickly create a database through the pipe without logging into the client CLI: - -``` -echo "create database kafka_test;" | /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client -``` - -## Step 2: Start the Kafka and Create topic - -### Start Kafka - -Unzip the Kafka and start the Kafka using the start script. - -``` -cd kafka -tar -xzf kafka_2.13-3.1.0.tgz -cd kafka_2.13-3.1.0 -./bin/kafka-server-start.sh -daemon config/server.properties -``` - -:::{note} - -The OpenMLDB service has used port 2181 to start zookeeper. Kafka does not need to start zookeeper again. 
Therefore, you only need to start the server here. -::: - -You can check whether Kafka is working normally. You can use `ps` to check. If the Kafka start failed, check the log `logs/server.log`. - -``` -ps axu|grep kafka -``` - -### Create Topics - -We create a topic named `topic1`. Please note that special characters should not appear in the name of the topic. - -``` -./bin/kafka-topics.sh --create --topic topic1 --bootstrap-server localhost:9092 -``` - -You can `describe` the topic to confirm whether it is normal. - -``` -./bin/kafka-topics.sh --describe --topic topic1 --bootstrap-server localhost:9092 -``` - -![topic status](../../zh/use_case/images/kafka_topic_describe.png) - -## Step 3: Start the Connector - -First, unzip the connector and the kafka_demo_files package in `/work/kafka`. - -``` -cd /work/kafka -tar zxf kafka-connect-jdbc.tgz -tar zxf kafka_demo_files.tgz -``` - -kafka_demo_files has the configuration files which are required to start the connector. And ensure to put the connector plug-in in the correct location. - -The first configuration file is the configuration of the connector itself, `connect-standalone.properties`. The key configuration of the `plugin.path` is as follows: - -``` -plugin.path=/usr/local/share/java -``` - -Connector and all dependent packages required to run it need to be put into this directory. The command is as follows: - -``` -mkdir -p /usr/local/share/java -cp -r /work/kafka/kafka-connect-jdbc /usr/local/share/java/ -``` - -The second configuration is the `openmldb-sink.properties` which is the config to connect the OpenMLDB cluster, as follows: - -``` -name=test-sink -connector.class=io.confluent.connect.jdbc.JdbcSinkConnector -tasks.max=1 -topics=topic1 -connection.url=jdbc:openmldb:///kafka_test?zk=127.0.0.1:2181&zkPath=/openmldb -auto.create=true -``` - -```{tip} -See [Configuring Connectors](https://kafka.apache.org/documentation/#connect_configuring) for full details about the config options. - -The option `connection.url` should be the right OpenMLDB address and database. The database must exist. -``` - -In the connection configuration, you need to fill in the correct OpenMLDB URL address. The connector receives the message of topic1 and automatically creates a table (auto.create). - -Next, start the connector using the Kafka connector standalone mode. - -``` -cd /work/kafka/kafka_2.13-3.1.0 -./bin/connect-standalone.sh -daemon ../kafka_demo_files/connect-standalone.properties ../kafka_demo_files/openmldb-sink.properties -``` - -Check whether the connector is started and correctly connected to the OpenMLDB cluster. You can check with `logs/connect.log`. Under normal circumstances, the log should have `Executing sink task`. - -## Step 4: Test - -### Send Messages - -We use the console producer provided by Kafka as the message sending tool for testing. - -Since we haven't created a table yet, our message should contain the schema to help Kafka parse the message and write it to OpenMLDB. 
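The exact message used by this demo (schema plus payload) is shown in the next block, and it is also shipped as the file `kafka_demo_files/message` mentioned below. As an optional aside, if you would rather send it programmatically than with the console producer, a rough sketch using the third-party `kafka-python` package (not otherwise used in this demo) could look like the following.

```python
# Rough sketch: publish the prepared schema+payload message to topic1 with the
# third-party kafka-python package (`pip3 install kafka-python`). The file path
# assumes you run this from the kafka_2.13-3.1.0 directory, like the console
# producer command below.
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers="localhost:9092")

# Reuse the prepared message file so the schema and payload stay identical to
# what the console producer sends.
with open("../kafka_demo_files/message", "rb") as f:
    producer.send("topic1", value=f.read())

producer.flush()
producer.close()
```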
- -``` -{"schema":{"type":"struct","fields":[{"type":"int16","optional":true,"field":"c1_int16"},{"type":"int32","optional":true,"field":"c2_int32"},{"type":"int64","optional":true,"field":"c3_int64"},{"type":"float","optional":true,"field":"c4_float"},{"type":"double","optional":true,"field":"c5_double"},{"type":"boolean","optional":true,"field":"c6_boolean"},{"type":"string","optional":true,"field":"c7_string"},{"type":"int64","name":"org.apache.kafka.connect.data.Date","optional":true,"field":"c8_date"},{"type":"int64","name":"org.apache.kafka.connect.data.Timestamp","optional":true,"field":"c9_timestamp"}],"optional":false,"name":"foobar"},"payload":{"c1_int16":1,"c2_int32":2,"c3_int64":3,"c4_float":4.4,"c5_double":5.555,"c6_boolean":true,"c7_string":"c77777","c8_date":19109,"c9_timestamp":1651051906000}} -``` - -More conveniently, we save the above message in the file `kafka_demo_files/message` where you can use it directly to send message to the Kafka with the console producer. - -``` -./bin/kafka-console-producer.sh --topic topic1 --bootstrap-server localhost:9092 < ../kafka_demo_files/message -``` - -```{tip} -If you want to send messages without the schema,but you don't have Schema Registry. You can create the table in OpenMLDB, and set `auto.schema=true` in Kafka connector, see [kafka connect jdbc doc](https://github.com/4paradigm/OpenMLDB/blob/main/extensions/kafka-connect-jdbc/DEVELOP.md) for full details. Only support to use with JsonConverter. -``` - -### Check Results - -We can query OpenMLDB to check whether the insertion is successful. The query script of `kafka_demo_files/select.sql` is as follows: - -``` -set @@execute_mode='online'; -use kafka_test; -select * from topic1; -``` - -You can directly run the query script with a query: - -``` -/work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client < ../kafka_demo_files/select.sql -``` - -![openmldb result](../../zh/use_case/images/kafka_openmldb_result.png) - -## Debug - -### Logs - -Kafka server log is `log/server.log`, check it if the Kafka server can't work. - -And the connector log is `log/connect.log`, check it if the producer failed or can't get the result in OpenMLDB. - -### Reinit - -If you met some error, you can reinitialize the environment to retry. - -To terminate kafka, kill the two daemon process: -``` -ps axu|grep kafka | grep -v grep | awk '{print $2}' | xargs kill -9 -``` - -To delete the data, ref [TERMINATE THE KAFKA ENVIRONMENT](https://kafka.apache.org/quickstart#quickstart_kafkaterminate): -``` -rm -rf /tmp/kafka-logs /tmp/kraft-combined-logs -``` - -Plz DO NOT kill zookeeper process or delete `/tmp/zookeeper` here, cuz OpenMLDB use the same zookeeper cluster too. We will kill the zookeeper process and delete the zookeeper data dir when we reinitialize the OpenMLDB cluster: -``` -/work/init.sh -``` -And then create the database in OpenMLDB, start the Kafka ... diff --git a/docs/en/use_case/lightgbm_demo.md b/docs/en/use_case/lightgbm_demo.md deleted file mode 100644 index c1310fdea66..00000000000 --- a/docs/en/use_case/lightgbm_demo.md +++ /dev/null @@ -1,206 +0,0 @@ -# OpenMLDB + LightGBM: Taxi Trip Duration Prediction - -In this document, we will take [the taxi travel time prediction problem on Kaggle as an example](https://www.kaggle.com/c/nyc-taxi-trip-duration/overview) to demonstrate how to use the OpenMLDB and LightGBM together to build a complete machine learning application. 
- -Note that: (1) this case is based on the OpenMLDB cluster version for tutorial demonstration; (2) this document uses the pre-compiled docker image. If you want to test it in the OpenMLDB environment compiled and built by yourself, you need to configure and use our [Spark Distribution for Feature Engineering Optimization](https://github.com/4paradigm/spark). Please refer to relevant documents of [compilation](https://openmldb.ai/docs/en/main/deploy/compile.html) (Refer to Chapter: "Spark Distribution Optimized for OpenMLDB") and the [installation and deployment documents](https://openmldb.ai/docs/en/main/deploy/install_deploy.html) (Refer to the section: [Deploy TaskManager](https://openmldb.ai/docs/en/main/deploy/install_deploy.html#deploy-taskmanager)). - -### 1. Preparation and Preliminary Knowledge - -#### 1.1. Pull and Start the OpenMLDB Docker Image - -- Note: Please make sure that the Docker Engine version number is > = 18.03 - -- Pull the OpenMLDB docker image and run the corresponding container: - -```bash -docker run -it 4pdosc/openmldb:0.8.4 bash -``` - -The image is preinstalled with OpenMLDB and preset with all scripts, third-party libraries, open-source tools and training data required for this case. - -```{note} -Note that all the commands below run in the docker container by default, and are assumed to be in the default directory (`/work/taxi-trip`). -``` - -#### 1.2. Initialize Environment - -```bash -./init.sh -cd taxi-trip -``` - -We provide the init.sh script in the image that helps users to quickly initialize the environment including: - -- Configure zookeeper - -- Start cluster version OpenMLDB - -#### 1.3. Start OpenMLDB CLI Client - -```bash -/work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client -``` - -```{note} -Note that most of the commands in this tutorial are executed under the OpenMLDB CLI. In order to distinguish from the ordinary shell environment, the commands executed under the OpenMLDB CLI use a special prompt of >. -``` - -#### 1.4. Preliminary Knowledge: Non-Blocking Task of Cluster Version - -Some commands in the cluster version are non-blocking tasks, including `LOAD DATA` in online mode and `LOAD DATA`, `SELECT`, `SELECT INTO` commands in the offline mode. After submitting a task, you can use relevant commands such as `SHOW JOBS` and `SHOW JOB` to view the task progress. For details, see the offline task management document. - -### 2. Machine Learning Based on OpenMLDB and LightGBM - -#### 2.1. Creating Databases and Data Tables - -The following commands are executed in the OpenMLDB CLI environment. - -```sql -> CREATE DATABASE demo_db; -> USE demo_db; -> CREATE TABLE t1(id string, vendor_id int, pickup_datetime timestamp, dropoff_datetime timestamp, passenger_count int, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, store_and_fwd_flag string, trip_duration int); -``` - -#### 2.2. Offline Data Preparation - -First, you need to switch to offline execution mode. Next, import the sample data `/work/taxi-trip/data/taxi_tour_table_train_simple.csv` as offline data that is used for offline feature calculation. - -The following commands are executed under the OpenMLDB CLI. - -```sql -> USE demo_db; -> SET @@execute_mode='offline'; -> LOAD DATA INFILE '/work/taxi-trip/data/taxi_tour_table_train_simple.snappy.parquet' INTO TABLE t1 options(format='parquet', header=true, mode='append'); -``` - -```{note} -Note that `LOAD DATA` is a non-blocking task. 
You can use the command `SHOW JOBS` to view the running status of the task. Please wait for the task to run successfully (`state` to `FINISHED` status) before proceeding to the next step. -``` - -#### 2.3. The Feature Extraction Script - -Usually, users need to analyze the data according to the goal of machine learning before designing the features, and then design and investigate the features according to the analysis. Data analysis and feature research of the machine learning are not the scope of this paper, and we will not expand it. We assumes that users already have the basic theoretical knowledge of machine learning, the ability to solve machine learning problems, the ability to understand SQL syntax, and the ability to use SQL syntax to construct features. - -For this case, the user has designed several features after the analysis and research: - -| Feature Name | Feature Meaning | SQL Feature Representation | -| --------------- | ------------------------------------------------------------ | --------------------------------------- | -| trip_duration | Travel time of a single trip | `trip_duration` | -| passenger_count | Number of passengers | `passenger_count` | -| vendor_sum_pl | Cumulative number of taxis of the same brand in the time window in the past 1 day (pickup_latitude) | `sum(pickup_latitude) OVER w` | -| vendor_max_pl | The largest number of taxis of the same brand in the time window in the past 1 day (pickup_latitude) | `max(pickup_latitude) OVER w` | -| vendor_min_pl | The minimum number of taxis of the same brand in the time window in the past 1 day (pickup_latitude) | `min(pickup_latitude) OVER w` | -| vendor_avg_pl | Average number of taxis of the same brand in the time window in the past 1 day (pickup_latitude) | `avg(pickup_latitude) OVER w` | -| pc_sum_pl | Cumulative trips of the same passenger capacity in the time window in the past 1 day (pickup_latitude) | `sum(pickup_latitude) OVER w2` | -| pc_max_pl | The maximum number of trips with the same passenger capacity in the time window in the past 1 day (pickup_latitude) | `max(pickup_latitude) OVER w2` | -| pc_min_pl | The minimum number of trips with the same passenger capacity in the time window in the past 1 day (pickup_latitude) | `min(pickup_latitude) OVER w2` | -| pc_avg_pl | Average number of trips with the same passenger capacity in the time window in the past 1 day (pickup_latitude) | `avg(pickup_latitude) OVER w2` | -| pc_cnt | The total number of trips with the same passenger capacity in the time window in the past 1 day | `count(vendor_id) OVER w2` | -| vendor_cnt | Total trips of taxis of the same brand in the time window in the past 1 day | `count(vendor_id) OVER w AS vendor_cnt` | - -#### 2.4. Offline Feature Extraction - -In the offline mode, the user extracts features and outputs the feature results to `/tmp/feature_data` that is saved in the data directory for subsequent model training. The `SELECT` command corresponds to the SQL feature extraction script generated based on the above table. The following commands are executed under the OpenMLDB CLI. 
- -```sql -> USE demo_db; -> SET @@execute_mode='offline'; -> SELECT trip_duration, passenger_count, -sum(pickup_latitude) OVER w AS vendor_sum_pl, -max(pickup_latitude) OVER w AS vendor_max_pl, -min(pickup_latitude) OVER w AS vendor_min_pl, -avg(pickup_latitude) OVER w AS vendor_avg_pl, -sum(pickup_latitude) OVER w2 AS pc_sum_pl, -max(pickup_latitude) OVER w2 AS pc_max_pl, -min(pickup_latitude) OVER w2 AS pc_min_pl, -avg(pickup_latitude) OVER w2 AS pc_avg_pl, -count(vendor_id) OVER w2 AS pc_cnt, -count(vendor_id) OVER w AS vendor_cnt -FROM t1 -WINDOW w AS (PARTITION BY vendor_id ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW), -w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW) INTO OUTFILE '/tmp/feature_data'; -``` - -Note that the cluster version `SELECT INTO` is a non-blocking task. You can use the command `SHOW JOBS` to view the running status of the task. Please wait for the task to run successfully (`state` to `FINISHED` status) before proceeding to the next step. - -#### 2.5. Model Training - -1. Model training will not be carry out in the OpenMLDB thus, exit the OpenMLDB CLI through the following `quit` command. - -```bash -> quit -``` - -2. Then in the command line, you execute train.py. It uses the open-source training tool `lightgbm` to train the model based on the offline features generated in the previous step, and the training results are stored in `/tmp/model.txt`. - -```bash -python3 train.py /tmp/feature_data /tmp/model.txt -``` - -#### 2.6. Online SQL Deployment - -Assuming that the model produced by the features designed in Section 2.3 in the previous model training meets the expectation. The next step is to deploy the feature extraction SQL script online to provide real-time feature extraction. - -1. Restart OpenMLDB CLI for SQL online deployment - -```bash -/work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client -``` - -2. To execute online deployment, the following commands are executed in OpenMLDB CLI. - -```sql -> USE demo_db; -> SET @@execute_mode='online'; -> DEPLOY demo OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') SELECT trip_duration, passenger_count, -sum(pickup_latitude) OVER w AS vendor_sum_pl, -max(pickup_latitude) OVER w AS vendor_max_pl, -min(pickup_latitude) OVER w AS vendor_min_pl, -avg(pickup_latitude) OVER w AS vendor_avg_pl, -sum(pickup_latitude) OVER w2 AS pc_sum_pl, -max(pickup_latitude) OVER w2 AS pc_max_pl, -min(pickup_latitude) OVER w2 AS pc_min_pl, -avg(pickup_latitude) OVER w2 AS pc_avg_pl, -count(vendor_id) OVER w2 AS pc_cnt, -count(vendor_id) OVER w AS vendor_cnt -FROM t1 -WINDOW w AS (PARTITION BY vendor_id ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW), -w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW); -``` - -#### 2.7. Online Data Import - -We need to import the data for real-time feature extraction. First, you need to switch to **online** execution mode. Then, in the online mode, import the sample data `/work/taxi-trip/data/taxi_tour_table_train_simple.csv` as the online data source. The following commands are executed under the OpenMLDB CLI. - -```sql -> USE demo_db; -> SET @@execute_mode='online'; -> LOAD DATA INFILE 'file:///work/taxi-trip/data/taxi_tour_table_train_simple.csv' INTO TABLE t1 options(format='csv', header=true, mode='append'); -``` - -Note that the cluster version `SELECT INTO` is a non-blocking task. 
You can use the command `SHOW JOBS` to view the running status of the task. Please wait for the task to run successfully (`state` to `FINISHED` status) before proceeding to the next step. - -#### 2.8. Start Online Prediction Service - -1. If you have not exited the OpenMLDB CLI, use the `quit` command to exit the OpenMLDB CLI. -2. Start the prediction service from the command line: - -``` -./start_predict_server.sh 127.0.0.1:9080 /tmp/model.txt -``` - -#### 2.9. Send Real-Time Request - -The `predict.py` script will send a line of request data to the prediction service. A returned results will be received and finally, prints them out. - -```bash -# Run inference with a HTTP request -python3 predict.py -# The following output is expected (the numbers might be slightly different) -----------------ins--------------- -[[ 2. 40.774097 40.774097 40.774097 40.774097 40.774097 40.774097 - 40.774097 40.774097 1. 1. ]] ----------------predict trip_duration ------------- -848.014745715936 s -``` - diff --git a/docs/en/use_case/pulsar_connector_demo.md b/docs/en/use_case/pulsar_connector_demo.md deleted file mode 100644 index dd3733d291b..00000000000 --- a/docs/en/use_case/pulsar_connector_demo.md +++ /dev/null @@ -1,270 +0,0 @@ -# Importing Real-Time Data Streams from Pulsar - -## Introduction - -Apache Pulsar is a cloud-native, distributed messaging and streaming platform. It can be used as online data source for OpenMLDB to import real-time data streams. You can learn more about Pulsar from the project website [https://pulsar.apache.org/](https://pulsar.apache.org/). We have developed an OpenMLDB JDBC Connector to work seamlessly with Pulsar. In this document, you will learn the concepts and usages of this connector. - -Note that, for the sake of simplicity, for this document, we use Pulsar Standalone, OpenMLDB cluster and a simple JSON message producer to show how the OpenMLDB JDBC Connector works. The connector also works well with the Pulsar Cluster. - -## Overview - -### Download - -- You can download the entire demo package [here](https://openmldb.ai/download/pulsar-connector/files.tar.gz), which are needed by this demo, including the connector nar, schema files, and config files. - -- If you would like to download the connector only, you can [download it here](https://github.com/4paradigm/OpenMLDB/releases/download/v0.4.4/pulsar-io-jdbc-openmldb-2.11.0-SNAPSHOT.nar) from the OpenMLDB release. - -### Workflow - -The below figure summarizes the workflow of using this connector. We will further explain the detail later. Moreover, we have recorded the steps at [terminalizer page](https://terminalizer.com/view/be2309235671) for easy reference; or you can also download the demo script [demo.yml](https://github.com/vagetablechicken/pulsar-openmldb-connector-demo/blob/main/demo.yml). -![demo steps](images/demo_steps.png) - - -## Step 1 -### Create OpenMLDB Cluster -Use docker to start it simply, and we need to create a test table, you can check on [Get started with cluster version of OpenMLDB](https://openmldb.ai/docs/en/v0.5/quickstart/openmldb_quickstart.html#get-started-with-cluster-version-of-openmldb) . -```{caution} -Only OpenMLDB cluster mode can be the sink dist, and only write to online storage. -``` - -We recommend that you use ‘host network’ to run docker. And bind volume ‘files’ too. The sql scripts are in it. 
```
docker run -dit --network host -v `pwd`/files:/work/pulsar_files --name openmldb 4pdosc/openmldb:0.8.4 bash
docker exec -it openmldb bash
```
```{note}
Even with the host network, docker on macOS does not support connecting to the container from the host. You can only reach the OpenMLDB cluster from other containers, such as the pulsar container.
```
In the OpenMLDB container, start the cluster:
```
./init.sh
```
### Create table
We use a script to create the table. The content of create.sql is:
```
create database pulsar_test;
use pulsar_test;
create table connector_test(id string, vendor_id int, pickup_datetime bigint, dropoff_datetime bigint, passenger_count int, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, store_and_fwd_flag string, trip_duration int);
desc connector_test;
```
Run the script:
```
/work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client < /work/pulsar_files/create.sql
```

![table desc](images/table.png)

```{note}
JSONSchema and the JDBC base connector do not support 'java.sql.Timestamp' yet, so we use 'bigint' as the type of the timestamp columns (it works in OpenMLDB).
```
## Step 2
### Start Pulsar Standalone
It is simpler and quicker to run Pulsar in docker.

We **recommend** running docker with the 'host network' to avoid network problems between docker containers.

We also need pulsar-admin, which ships inside the docker container, to create a sink. So we first run the container with bash and then execute the commands inside it.

Don't forget to bind the 'files' directory.

```
docker run -dit --network host -v `pwd`/files:/pulsar/files --name pulsar apachepulsar/pulsar:2.9.1 bash
docker exec -it pulsar bash
```

In the Pulsar container, start the pulsar standalone server:
```
bin/pulsar-daemon start standalone --zookeeper-port 5181
```
```{note}
OpenMLDB already uses port 2181, so we change the zookeeper port here. We will still use zk port 2181 to connect to OpenMLDB; the zk port of the Pulsar standalone does not affect anything.
```
You can use `ps` to check whether pulsar is running well. If it failed to start, check the standalone server log `logs/pulsar-standalone-....log`.
```
ps axu|grep pulsar
```

When you start a local standalone cluster, a public/default namespace is created automatically. The namespace is intended for development purposes, see the [pulsar doc](https://pulsar.apache.org/docs/en/2.9.0/standalone/#start-pulsar-standalone).

**We will create the sink in this namespace**.

```{seealso}
If you really want to run pulsar locally instead of in docker, see [Set up a standalone Pulsar locally](https://pulsar.apache.org/docs/en/standalone/).
```
#### Q&A
Q:
```
2022-04-07T03:15:59,289+0000 [main] INFO org.apache.zookeeper.server.NIOServerCnxnFactory - binding to port 0.0.0.0/0.0.0.0:5181
2022-04-07T03:15:59,289+0000 [main] ERROR org.apache.pulsar.zookeeper.LocalBookkeeperEnsemble - Exception while instantiating ZooKeeper
java.net.BindException: Address already in use
```
How to fix it?
A: Pulsar needs an unused port to start its zookeeper server, and 5181 is already in use. Choose another port for '--zookeeper-port'.

Q: 8080 is already used?
A: Change 'webServicePort' in `conf/standalone.conf`. Don't forget 'webServiceUrl' in `conf/client.conf`; pulsar-admin needs that config.

Q: 6650 is already used?
A: Change 'brokerServicePort' in `conf/standalone.conf` and 'brokerServiceUrl' in `conf/client.conf`.

### Connector installation (Optional)
In the previous step, we bind mounted the 'files' directory, which contains the connector nar.
-We’ll use ‘non built-in connector’ mode to set up the connector(use ‘archive’ in sink config). - -If you really want the connector to be the built-in connector, copy it to ‘connectors’. -``` -mkdir connectors -cp files/pulsar-io-jdbc-openmldb-2.11.0-SNAPSHOT.nar connectors/ -``` -You want to change or add more connectors, you can update connectors when pulsar standalone is running: -``` -bin/pulsar-admin sinks reload -``` - -Built-in OpenMLDB connector's sink type is 'jdbc-openmldb'. - -### Create sink -We use the 'public/default' namespace to create sink, and we need a sink config file, it’s `files/pulsar-openmldb-jdbc-sink.yaml`, content: -``` - tenant: "public" - namespace: "default" - name: "openmldb-test-sink" - archive: "files/pulsar-io-jdbc-openmldb-2.11.0-SNAPSHOT.nar" - inputs: ["test_openmldb"] - configs: - jdbcUrl: "jdbc:openmldb:///pulsar_test?zk=localhost:2181&zkPath=/openmldb" - tableName: "connector_test" -``` -```{describe} -'name' is the sink name. - -We use 'archive' to set the sink connector, so we use openmldb connector as non built-in connector. - -'input' means the topic names, we use only one here. - -'config' is jdbc config which used to connect openmldb cluster. -``` - -Then create a sink and check it, notice that the input topic is 'test_openmldb'. -``` -./bin/pulsar-admin sinks create --sink-config-file files/pulsar-openmldb-jdbc-sink.yaml -./bin/pulsar-admin sinks status --name openmldb-test-sink -``` -![init sink status](images/init_sink_status.png) - -### Create Schema -Upload schema to topic 'test_openmldb', schema type is JSON. We’ll produce the JSON message in the same schema later. The schema file is ‘files/openmldb-table-schema’. -Schema content: -``` - { - "type": "JSON", - "schema":"{\"type\":\"record\",\"name\":\"OpenMLDBSchema\",\"namespace\":\"com.foo\",\"fields\":[{\"name\":\"id\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"vendor_id\",\"type\":\"int\"},{\"name\":\"pickup_datetime\",\"type\":\"long\"},{\"name\":\"dropoff_datetime\",\"type\":\"long\"},{\"name\":\"passenger_count\",\"type\":\"int\"},{\"name\":\"pickup_longitude\",\"type\":\"double\"},{\"name\":\"pickup_latitude\",\"type\":\"double\"},{\"name\":\"dropoff_longitude\",\"type\":\"double\"},{\"name\":\"dropoff_latitude\",\"type\":\"double\"},{\"name\":\"store_and_fwd_flag\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"trip_duration\",\"type\":\"int\"}]}", - "properties": {} - } -``` - -Upload schema and check it, commands: -``` -./bin/pulsar-admin schemas upload test_openmldb -f ./files/openmldb-table-schema -./bin/pulsar-admin schemas get test_openmldb -``` -For demonstration purposes, we omit the fields part. The result as follows: -![topic schema](images/topic_schema.png) -## Test -### Send messages -We use the first 2 rows of sample data(in openmldb docker `data/taxi_tour_table_train_simple.csv`) to be the test messages, as follows. - -![test data](images/test_data.png) - -#### Java Producer -Producer JAVA code in [demo producer](https://github.com/vagetablechicken/pulsar-client-java). Essential code is ![snippet](images/producer_code.png) - -So the producer will send the 2 messages to topic ‘test_openmldb’. And then Pulsar will read the messages and write them to OpenMLDB cluster online storage. - -The package is in ‘files’. You can run it directly. 
- -``` -java -cp files/pulsar-client-java-1.0-SNAPSHOT-jar-with-dependencies.jar org.example.Client -``` - -#### Python Producer -You can write the Producer in Python, please check the code in `files/pulsar_client.py`. -Before run it, you should install the pulsar python client: -``` -pip3 install pulsar-client==2.9.1 -``` -Then run the producer: -``` -python3 files/pulsar_client.py -``` - -### Check -#### Check in Pulsar -We can check the sink status: -``` -./bin/pulsar-admin sinks status --name openmldb-test-sink -``` -![sink status](images/sink_status.png) -```{note} -"numReadFromPulsar": pulsar sent 2 messages to the sink instance. - -"numWrittenToSink": sink instance write 2 messages to OpenMLDB. -``` - -#### Check in OpenMLDB -And we can get these messages data in the OpenMLDB table’s **online storage** now. -The script select.sql content: -``` -set @@execute_mode='online'; -use pulsar_test; -select *, string(timestamp(pickup_datetime)), string(timestamp(dropoff_datetime)) from connector_test; -``` -In OpenMLDB container, run: -``` -/work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client < /work/pulsar_files/select.sql -``` -![openmldb result](images/openmldb_result.png) - -### Debug - -If the OpenMLDB table doesn't have the data, but the sinks status shows it has written to OpenMLDB, the sink instance may have some problems. You should check the sink log, the path is `logs/functions/public/default/openmldb-test-sink/openmldb-test-sink-0.log`. If you use another sink name, the path will change. - -Pulsar will retry to write the failed messages. So if you sent the wrong message 1 and then sent the right message 2, even the right message 2 has written to OpenMLDB, the wrong message 1 will be sent and print the error in log. It's confusing. We'd recommend you to truncate the topic before testing again. -``` -./bin/pulsar-admin topics truncate persistent://public/default/test_openmldb -``` -If you use another sink name, you can get it by `./bin/pulsar-admin topics list public/default`. - -#### debug log - -If the sink instance log is not enough, you can open the debug level of log. You should modify the log config, and restart the sink instance. - -`vim conf/functions_log4j2.xml` and modify it: - -```xml - - pulsar.log.level - debug - -``` -```xml - - ${sys:pulsar.log.level} - - ${sys:pulsar.log.appender} - ${sys:pulsar.log.level} - - -``` - -Then restart the sink instance: -``` -./bin/pulsar-admin sinks restart --name openmldb-test-sink -``` - -#### reinitialize Pulsar -``` -bin/pulsar-daemon stop standalone --zookeeper-port 5181 -rm -r data logs -bin/pulsar-daemon start standalone --zookeeper-port 5181 -``` diff --git a/docs/poetry.lock b/docs/poetry.lock index 724b4f19340..3b2ecfb5fc7 100644 --- a/docs/poetry.lock +++ b/docs/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "alabaster" @@ -133,13 +133,13 @@ testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs [[package]] name = "jinja2" -version = "3.1.2" +version = "3.1.3" description = "A very fast and expressive template engine." 
optional = false python-versions = ">=3.7" files = [ - {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, - {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, + {file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"}, + {file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"}, ] [package.dependencies] diff --git a/docs/zh/about/index.rst b/docs/zh/about/index.rst index 24ba7218e5b..012ae28e362 100644 --- a/docs/zh/about/index.rst +++ b/docs/zh/about/index.rst @@ -8,4 +8,5 @@ intro community 发展历程 - 更新日志 \ No newline at end of file + 更新日志 + 介绍材料 \ No newline at end of file diff --git a/docs/zh/app_ecosystem/feat_insight/faq.md b/docs/zh/app_ecosystem/feat_insight/faq.md new file mode 100644 index 00000000000..1d260f2bdbf --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/faq.md @@ -0,0 +1,38 @@ +# 常见问题 + +## FeatInsight 和主流 Feature Store 有什么区别? + +主流 Feature Store 包括 Feast、Tecton、Feathr 等提供了特征管理和计算能力,在线存储主要使用 Redis 等预聚合 Key-value 存储。FeatInsight 提供的是实时计算特征的能力,特征抽取方案无论怎样修改都可以直接一键上线而不需要重新上线和同步在线数据。主要的功能对比如下。 + +| 特征存储系统 | Feast | Tecton | Feathr | FeatInsight | +| ----------------- | ------------------ | ----------------- | ----------------- | ----------------- | +| 数据源支持 | 多种数据源 | 多种数据源 | 多种数据源 | 多种数据源 | +| 可扩展性 | 高 | 高 | 中到高 | 高 | +| 实时特征服务 | 支持 | 支持 | 支持 | 支持 | +| 批处理特征服务 | 支持 | 支持 | 支持 | 支持 | +| 特征转换 | 支持基本转换 | 支持复杂转换和 SQL | 支持复杂转换 | 支持复杂转换和 SQL | +| 数据存储 | 支持多种存储选项 | 主要支持云存储 | 支持多种存储选项 | 内置高性能时序数据库,支持多种存储选项 | +| 社区和支持 | 开源社区 | 商业支持 | 开源社区 | 开源社区 | +| 实时特征计算 | 不支持 | 不支持 | 不支持 | 支持 | + +## 部署 FeatInsight 是否需要 OpenMLDB ? + +需要,因为 FeatInsight 的元数据存储以及特征计算依赖 OpenMLDB 集群,因此部署 FeatInsight 需要提前部署 OpenMLDB 集群,也可以使用整合两者的 [Docker 镜像](./install/docker.md)一键部署。 + +使用 FeatInsight 后用户可以不依赖 OpenMLDB CLI 或 SDK 来实现特征的开发和上线,通过 Web 界面就可以完成特征工程的所有上线需求。 + +## 如何基于 FeatInsight 实现 MLOps 工作流? + +使用 FeatInsight 可以在 Web 前端完成数据库、数据表的创建,然后提交在线数据和离线数据的导入工作。使用 OpenMLDB SQL 语法进行数据的探索以及特征的创建,然后就可以离线特征的导出以及在线特征的一键上线,从 MLOps 对离线到在线流程不需要任何额外的开发工作,具体流程可参考[快速入门](./quickstart.md)。 + +## FeatInsight 的生态集成支持如何? + +FeatInsight 依托于 OpenMLDB 生态,支持与 OpenMLDB 生态中的其他组件进行集成。 + +例如与 OpenMLDB 生态中的数据集成组件进行集成,支持 [Kafka](../../integration/online_datasources/kafka_connector_demo.md)、[Pulsar](../../integration/online_datasources/pulsar_connector_demo.md)、[RocketMQ](../../integration/online_datasources/rocketmq_connector.md)、[Hive](../../integration/offline_data_sources/hive.md)、[Amazon S3](../../integration/offline_data_sources/s3.md),调度系统支持 [Airflow](../../integration/deploy_integration/airflow_provider_demo.md)、[DolphinScheduler](../../integration/deploy_integration/dolphinscheduler_task_demo.md)、[Byzer](../../integration/deploy_integration/OpenMLDB_Byzer_taxi.md) 等,对于 Spark Connector 支持的 HDFS、Iceberg 等和云计算相关的 Kubernetes、阿里云 MaxCompute 等也有一定程度的支持。 + +## FeatInsight 有什么业务价值和技术含量? 
+ +相比于使用 HDFS 存储离线数据、Redis 存储在线数据的简易版 Feature Store,FeatInsight 的价值在于使用了 OpenMLDB SQL 这种在线离线一致性的特征抽取语言。对于特征开发的科学家,只需要编写 SQL 逻辑就可以完成特征定义,在离线场景下这个 SQL 会被翻译成分布式 Spark 应用来执行,在在线场景下同样的 SQL 会被翻译成在线时序数据库的查询语句来执行,实现特征的在线和离线一致性。 + +目前 SQL 编译器、在线存储引擎、离线计算引擎都是基于 C++ 和 Scala 等编程语言实现的,对于非技术背景的科学家来说,使用 SQL 语言来定义特征开发流程,可以降低学习成本,提高开发效率。所有代码都是开源可用,OpenMLDB 项目地址 https://github.com/4paradigm/openmldb ,FeatInsight 项目地址 https://github.com/4paradigm/FeatInsight 。 diff --git a/docs/zh/app_ecosystem/feature_platform/functions/computed_features.md b/docs/zh/app_ecosystem/feat_insight/functions/computed_features.md similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/functions/computed_features.md rename to docs/zh/app_ecosystem/feat_insight/functions/computed_features.md diff --git a/docs/zh/app_ecosystem/feature_platform/functions/import_data.md b/docs/zh/app_ecosystem/feat_insight/functions/import_data.md similarity index 98% rename from docs/zh/app_ecosystem/feature_platform/functions/import_data.md rename to docs/zh/app_ecosystem/feat_insight/functions/import_data.md index 106cf2092c7..75d85fd79e4 100644 --- a/docs/zh/app_ecosystem/feature_platform/functions/import_data.md +++ b/docs/zh/app_ecosystem/feat_insight/functions/import_data.md @@ -3,7 +3,7 @@ ## 介绍 -OpenMLDB 特征平台在前端支持数据导入相关功能,功能如下: +FeatInsight 在前端支持数据导入相关功能,功能如下: * 数据库管理 * 创建数据库 diff --git a/docs/zh/app_ecosystem/feature_platform/functions/index.rst b/docs/zh/app_ecosystem/feat_insight/functions/index.rst similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/functions/index.rst rename to docs/zh/app_ecosystem/feat_insight/functions/index.rst diff --git a/docs/zh/app_ecosystem/feature_platform/functions/manage_center.md b/docs/zh/app_ecosystem/feat_insight/functions/manage_center.md similarity index 88% rename from docs/zh/app_ecosystem/feature_platform/functions/manage_center.md rename to docs/zh/app_ecosystem/feat_insight/functions/manage_center.md index 5633df6f826..d785a48f4a3 100644 --- a/docs/zh/app_ecosystem/feature_platform/functions/manage_center.md +++ b/docs/zh/app_ecosystem/feat_insight/functions/manage_center.md @@ -2,7 +2,7 @@ ## 介绍 -OpenMLDB 特征平台提供了对于数据表,特征,任务以及服务的管理功能,用户可以在管理中心查看和管理相关资源。 +FeatInsight 提供了对于数据表,特征,任务以及服务的管理功能,用户可以在管理中心查看和管理相关资源。 目前支持查看和管理的资源如下: diff --git a/docs/zh/app_ecosystem/feature_platform/functions/manage_feature.md b/docs/zh/app_ecosystem/feat_insight/functions/manage_feature.md similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/functions/manage_feature.md rename to docs/zh/app_ecosystem/feat_insight/functions/manage_feature.md diff --git a/docs/zh/app_ecosystem/feature_platform/functions/offline_scenario.md b/docs/zh/app_ecosystem/feat_insight/functions/offline_scenario.md similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/functions/offline_scenario.md rename to docs/zh/app_ecosystem/feat_insight/functions/offline_scenario.md diff --git a/docs/zh/app_ecosystem/feature_platform/functions/online_scenario.md b/docs/zh/app_ecosystem/feat_insight/functions/online_scenario.md similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/functions/online_scenario.md rename to docs/zh/app_ecosystem/feat_insight/functions/online_scenario.md diff --git a/docs/zh/app_ecosystem/feature_platform/functions/sql_playground.md b/docs/zh/app_ecosystem/feat_insight/functions/sql_playground.md similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/functions/sql_playground.md rename to 
docs/zh/app_ecosystem/feat_insight/functions/sql_playground.md diff --git a/docs/zh/app_ecosystem/feature_platform/functions/sql_tool.md b/docs/zh/app_ecosystem/feat_insight/functions/sql_tool.md similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/functions/sql_tool.md rename to docs/zh/app_ecosystem/feat_insight/functions/sql_tool.md diff --git a/docs/zh/app_ecosystem/feature_platform/images/bigscreen.png b/docs/zh/app_ecosystem/feat_insight/images/bigscreen.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/bigscreen.png rename to docs/zh/app_ecosystem/feat_insight/images/bigscreen.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/computed_feature_page.png b/docs/zh/app_ecosystem/feat_insight/images/computed_feature_page.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/computed_feature_page.png rename to docs/zh/app_ecosystem/feat_insight/images/computed_feature_page.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/computed_feature_sample.png b/docs/zh/app_ecosystem/feat_insight/images/computed_feature_sample.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/computed_feature_sample.png rename to docs/zh/app_ecosystem/feat_insight/images/computed_feature_sample.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/computed_feature_with_index.png b/docs/zh/app_ecosystem/feat_insight/images/computed_feature_with_index.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/computed_feature_with_index.png rename to docs/zh/app_ecosystem/feat_insight/images/computed_feature_with_index.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/create_database_form.png b/docs/zh/app_ecosystem/feat_insight/images/create_database_form.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/create_database_form.png rename to docs/zh/app_ecosystem/feat_insight/images/create_database_form.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/create_feature_form1.png b/docs/zh/app_ecosystem/feat_insight/images/create_feature_form1.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/create_feature_form1.png rename to docs/zh/app_ecosystem/feat_insight/images/create_feature_form1.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/create_feature_form2.png b/docs/zh/app_ecosystem/feat_insight/images/create_feature_form2.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/create_feature_form2.png rename to docs/zh/app_ecosystem/feat_insight/images/create_feature_form2.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/create_feature_service.png b/docs/zh/app_ecosystem/feat_insight/images/create_feature_service.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/create_feature_service.png rename to docs/zh/app_ecosystem/feat_insight/images/create_feature_service.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/create_feature_service_with_keys.png b/docs/zh/app_ecosystem/feat_insight/images/create_feature_service_with_keys.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/create_feature_service_with_keys.png rename to docs/zh/app_ecosystem/feat_insight/images/create_feature_service_with_keys.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/create_offline_sample.png 
b/docs/zh/app_ecosystem/feat_insight/images/create_offline_sample.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/create_offline_sample.png rename to docs/zh/app_ecosystem/feat_insight/images/create_offline_sample.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/create_table_form.png b/docs/zh/app_ecosystem/feat_insight/images/create_table_form.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/create_table_form.png rename to docs/zh/app_ecosystem/feat_insight/images/create_table_form.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/create_table_from_hive.png b/docs/zh/app_ecosystem/feat_insight/images/create_table_from_hive.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/create_table_from_hive.png rename to docs/zh/app_ecosystem/feat_insight/images/create_table_from_hive.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/create_table_from_parquet.png b/docs/zh/app_ecosystem/feat_insight/images/create_table_from_parquet.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/create_table_from_parquet.png rename to docs/zh/app_ecosystem/feat_insight/images/create_table_from_parquet.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/create_table_from_sql.png b/docs/zh/app_ecosystem/feat_insight/images/create_table_from_sql.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/create_table_from_sql.png rename to docs/zh/app_ecosystem/feat_insight/images/create_table_from_sql.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/create_test_feature_service.png b/docs/zh/app_ecosystem/feat_insight/images/create_test_feature_service.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/create_test_feature_service.png rename to docs/zh/app_ecosystem/feat_insight/images/create_test_feature_service.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/create_test_featureview.png b/docs/zh/app_ecosystem/feat_insight/images/create_test_featureview.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/create_test_featureview.png rename to docs/zh/app_ecosystem/feat_insight/images/create_test_featureview.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/create_test_table.png b/docs/zh/app_ecosystem/feat_insight/images/create_test_table.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/create_test_table.png rename to docs/zh/app_ecosystem/feat_insight/images/create_test_table.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/csv_import_test_table.png b/docs/zh/app_ecosystem/feat_insight/images/csv_import_test_table.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/csv_import_test_table.png rename to docs/zh/app_ecosystem/feat_insight/images/csv_import_test_table.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/database_detail.png b/docs/zh/app_ecosystem/feat_insight/images/database_detail.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/database_detail.png rename to docs/zh/app_ecosystem/feat_insight/images/database_detail.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/database_list.png b/docs/zh/app_ecosystem/feat_insight/images/database_list.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/database_list.png rename to 
docs/zh/app_ecosystem/feat_insight/images/database_list.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/delete_feature_view.png b/docs/zh/app_ecosystem/feat_insight/images/delete_feature_view.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/delete_feature_view.png rename to docs/zh/app_ecosystem/feat_insight/images/delete_feature_view.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/export_test_offline_samples.png b/docs/zh/app_ecosystem/feat_insight/images/export_test_offline_samples.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/export_test_offline_samples.png rename to docs/zh/app_ecosystem/feat_insight/images/export_test_offline_samples.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/feature_detail.png b/docs/zh/app_ecosystem/feat_insight/images/feature_detail.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/feature_detail.png rename to docs/zh/app_ecosystem/feat_insight/images/feature_detail.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/feature_service_detail.png b/docs/zh/app_ecosystem/feat_insight/images/feature_service_detail.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/feature_service_detail.png rename to docs/zh/app_ecosystem/feat_insight/images/feature_service_detail.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/feature_service_version_detail.png b/docs/zh/app_ecosystem/feat_insight/images/feature_service_version_detail.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/feature_service_version_detail.png rename to docs/zh/app_ecosystem/feat_insight/images/feature_service_version_detail.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/feature_services_page.png b/docs/zh/app_ecosystem/feat_insight/images/feature_services_page.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/feature_services_page.png rename to docs/zh/app_ecosystem/feat_insight/images/feature_services_page.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/feature_view_detail.png b/docs/zh/app_ecosystem/feat_insight/images/feature_view_detail.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/feature_view_detail.png rename to docs/zh/app_ecosystem/feat_insight/images/feature_view_detail.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/features_page.png b/docs/zh/app_ecosystem/feat_insight/images/features_page.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/features_page.png rename to docs/zh/app_ecosystem/feat_insight/images/features_page.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/ide_develop_featuer_platform.png b/docs/zh/app_ecosystem/feat_insight/images/ide_develop_featuer_platform.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/ide_develop_featuer_platform.png rename to docs/zh/app_ecosystem/feat_insight/images/ide_develop_featuer_platform.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/import_data.png b/docs/zh/app_ecosystem/feat_insight/images/import_data.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/import_data.png rename to docs/zh/app_ecosystem/feat_insight/images/import_data.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/import_job_result.png 
b/docs/zh/app_ecosystem/feat_insight/images/import_job_result.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/import_job_result.png rename to docs/zh/app_ecosystem/feat_insight/images/import_job_result.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/import_offline_from_csv.png b/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_csv.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/import_offline_from_csv.png rename to docs/zh/app_ecosystem/feat_insight/images/import_offline_from_csv.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/import_offline_from_hive.png b/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_hive.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/import_offline_from_hive.png rename to docs/zh/app_ecosystem/feat_insight/images/import_offline_from_hive.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/import_offline_from_parquet.png b/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_parquet.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/import_offline_from_parquet.png rename to docs/zh/app_ecosystem/feat_insight/images/import_offline_from_parquet.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/import_offline_from_sql.png b/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_sql.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/import_offline_from_sql.png rename to docs/zh/app_ecosystem/feat_insight/images/import_offline_from_sql.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/import_online_from_csv.png b/docs/zh/app_ecosystem/feat_insight/images/import_online_from_csv.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/import_online_from_csv.png rename to docs/zh/app_ecosystem/feat_insight/images/import_online_from_csv.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/import_online_from_hive.png b/docs/zh/app_ecosystem/feat_insight/images/import_online_from_hive.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/import_online_from_hive.png rename to docs/zh/app_ecosystem/feat_insight/images/import_online_from_hive.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/import_online_from_insert.png b/docs/zh/app_ecosystem/feat_insight/images/import_online_from_insert.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/import_online_from_insert.png rename to docs/zh/app_ecosystem/feat_insight/images/import_online_from_insert.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/import_online_from_parquet.png b/docs/zh/app_ecosystem/feat_insight/images/import_online_from_parquet.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/import_online_from_parquet.png rename to docs/zh/app_ecosystem/feat_insight/images/import_online_from_parquet.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/import_online_from_sql.png b/docs/zh/app_ecosystem/feat_insight/images/import_online_from_sql.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/import_online_from_sql.png rename to docs/zh/app_ecosystem/feat_insight/images/import_online_from_sql.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/local_test_offline_samples.png 
b/docs/zh/app_ecosystem/feat_insight/images/local_test_offline_samples.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/local_test_offline_samples.png rename to docs/zh/app_ecosystem/feat_insight/images/local_test_offline_samples.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/offline_jobs_page.png b/docs/zh/app_ecosystem/feat_insight/images/offline_jobs_page.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/offline_jobs_page.png rename to docs/zh/app_ecosystem/feat_insight/images/offline_jobs_page.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/offline_sample_detail.png b/docs/zh/app_ecosystem/feat_insight/images/offline_sample_detail.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/offline_sample_detail.png rename to docs/zh/app_ecosystem/feat_insight/images/offline_sample_detail.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/offline_samples_page.png b/docs/zh/app_ecosystem/feat_insight/images/offline_samples_page.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/offline_samples_page.png rename to docs/zh/app_ecosystem/feat_insight/images/offline_samples_page.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/offline_scenario.png b/docs/zh/app_ecosystem/feat_insight/images/offline_scenario.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/offline_scenario.png rename to docs/zh/app_ecosystem/feat_insight/images/offline_scenario.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/online_csv_import_test_table.png b/docs/zh/app_ecosystem/feat_insight/images/online_csv_import_test_table.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/online_csv_import_test_table.png rename to docs/zh/app_ecosystem/feat_insight/images/online_csv_import_test_table.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/online_scenario.png b/docs/zh/app_ecosystem/feat_insight/images/online_scenario.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/online_scenario.png rename to docs/zh/app_ecosystem/feat_insight/images/online_scenario.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/preview_test_features.png b/docs/zh/app_ecosystem/feat_insight/images/preview_test_features.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/preview_test_features.png rename to docs/zh/app_ecosystem/feat_insight/images/preview_test_features.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/preview_test_table.png b/docs/zh/app_ecosystem/feat_insight/images/preview_test_table.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/preview_test_table.png rename to docs/zh/app_ecosystem/feat_insight/images/preview_test_table.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/request_feature_service.png b/docs/zh/app_ecosystem/feat_insight/images/request_feature_service.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/request_feature_service.png rename to docs/zh/app_ecosystem/feat_insight/images/request_feature_service.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/request_test_feature_service.png b/docs/zh/app_ecosystem/feat_insight/images/request_test_feature_service.png similarity index 100% rename from 
docs/zh/app_ecosystem/feature_platform/images/request_test_feature_service.png rename to docs/zh/app_ecosystem/feat_insight/images/request_test_feature_service.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/sql_playground_fail.png b/docs/zh/app_ecosystem/feat_insight/images/sql_playground_fail.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/sql_playground_fail.png rename to docs/zh/app_ecosystem/feat_insight/images/sql_playground_fail.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/sql_playground_offline.png b/docs/zh/app_ecosystem/feat_insight/images/sql_playground_offline.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/sql_playground_offline.png rename to docs/zh/app_ecosystem/feat_insight/images/sql_playground_offline.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/sql_playground_online.png b/docs/zh/app_ecosystem/feat_insight/images/sql_playground_online.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/sql_playground_online.png rename to docs/zh/app_ecosystem/feat_insight/images/sql_playground_online.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/sql_tool_entry.png b/docs/zh/app_ecosystem/feat_insight/images/sql_tool_entry.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/sql_tool_entry.png rename to docs/zh/app_ecosystem/feat_insight/images/sql_tool_entry.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/sql_tool_tutorial.png b/docs/zh/app_ecosystem/feat_insight/images/sql_tool_tutorial.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/sql_tool_tutorial.png rename to docs/zh/app_ecosystem/feat_insight/images/sql_tool_tutorial.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/table_detail.png b/docs/zh/app_ecosystem/feat_insight/images/table_detail.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/table_detail.png rename to docs/zh/app_ecosystem/feat_insight/images/table_detail.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/table_list.png b/docs/zh/app_ecosystem/feat_insight/images/table_list.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/table_list.png rename to docs/zh/app_ecosystem/feat_insight/images/table_list.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/tables_page.png b/docs/zh/app_ecosystem/feat_insight/images/tables_page.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/tables_page.png rename to docs/zh/app_ecosystem/feat_insight/images/tables_page.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/test_feature_service_detail.png b/docs/zh/app_ecosystem/feat_insight/images/test_feature_service_detail.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/test_feature_service_detail.png rename to docs/zh/app_ecosystem/feat_insight/images/test_feature_service_detail.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/test_features_list.png b/docs/zh/app_ecosystem/feat_insight/images/test_features_list.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/test_features_list.png rename to docs/zh/app_ecosystem/feat_insight/images/test_features_list.png diff --git a/docs/zh/app_ecosystem/feature_platform/images/test_offline_sample_detail.png 
b/docs/zh/app_ecosystem/feat_insight/images/test_offline_sample_detail.png similarity index 100% rename from docs/zh/app_ecosystem/feature_platform/images/test_offline_sample_detail.png rename to docs/zh/app_ecosystem/feat_insight/images/test_offline_sample_detail.png diff --git a/docs/zh/app_ecosystem/feature_platform/index.rst b/docs/zh/app_ecosystem/feat_insight/index.rst similarity index 80% rename from docs/zh/app_ecosystem/feature_platform/index.rst rename to docs/zh/app_ecosystem/feat_insight/index.rst index 393e9e4dd68..296bfd07586 100644 --- a/docs/zh/app_ecosystem/feature_platform/index.rst +++ b/docs/zh/app_ecosystem/feat_insight/index.rst @@ -1,5 +1,5 @@ ============================= -OpenMLDB 特征平台 +FeatInsight ============================= .. toctree:: @@ -9,8 +9,5 @@ OpenMLDB 特征平台 quickstart install/index functions/index - - - - - \ No newline at end of file + use_cases/index + faq diff --git a/docs/zh/app_ecosystem/feature_platform/install/config_file.md b/docs/zh/app_ecosystem/feat_insight/install/config_file.md similarity index 82% rename from docs/zh/app_ecosystem/feature_platform/install/config_file.md rename to docs/zh/app_ecosystem/feat_insight/install/config_file.md index fc66ef07a64..17d8539540d 100644 --- a/docs/zh/app_ecosystem/feature_platform/install/config_file.md +++ b/docs/zh/app_ecosystem/feat_insight/install/config_file.md @@ -1,8 +1,8 @@ -# 特征平台配置文件 +# FeatInsight 配置文件 ## 介绍 -OpenMLDB 特征平台基于 Spring Boot 开发,使用 `application.yml` 规范作为配置文件。 +FeatInsight 基于 Spring Boot 开发,使用 `application.yml` 规范作为配置文件。 ## 配置示例 diff --git a/docs/zh/app_ecosystem/feat_insight/install/docker.md b/docs/zh/app_ecosystem/feat_insight/install/docker.md new file mode 100644 index 00000000000..3625d61bd47 --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/install/docker.md @@ -0,0 +1,46 @@ +# Docker + +## 介绍 + +使用官方构建好的 Docker 镜像, 可以快速部署 OpenMLDB 特征服务. + +## 内置 OpenMLDB 镜像 + +使用内置 OpenMLDB 的镜像,可以一键启动 OpenMLDB 集群和 OpenMLDB 特征服务,无需额外部署即可使用特征服务。 + +``` +docker run -d -p 8888:8888 registry.cn-shenzhen.aliyuncs.com/tobe43/portable-openmldb +``` + +启动 OpenMLDB 和 FeatInsight 需要约一分钟,可通过 `docker logs` 查看日志,启动成功后在本地浏览器打开 `http://127.0.0.1:8888` 即可访问 FeatInsight 服务。 + + +## 不包含 OpenMLDB 镜像 + +使用不包含 OpenMLDB 的镜像,需要提前部署 OpenMLDB 集群,然后启动 OpenMLDB 特征服务容器,部署步骤较繁琐但灵活性高。 + +首先参考 [OpenMLDB 部署文档](../../../deploy/index.rst) 提前部署 OpenMLDB 集群。 + +然后参考 [FeatInsight 配置文件](./config_file.md),创建 `application.yml` 配置文件。 + +``` +server: + port: 8888 + +openmldb: + zk_cluster: 127.0.0.1:2181 + zk_path: /openmldb + apiserver: 127.0.0.1:9080 +``` + +对于 Linux 操作系统可以使用下面命令启动 FeatInsight 容器. + +``` +docker run -d -p 8888:8888 --net=host -v `pwd`/application.yml:/app/application.yml registry.cn-shenzhen.aliyuncs.com/tobe43/featinsight +``` + +由于 MacOS 通过虚拟机启动 Docker 容器,使用 `--net=host` 参数无法正常工作,需要提前修改配置文件指向正确的 OpenMLDB 服务。 + +``` +docker run -d -p 8888:8888 -v `pwd`/application.yml:/app/application.yml registry.cn-shenzhen.aliyuncs.com/tobe43/featinsight +``` diff --git a/docs/zh/app_ecosystem/feature_platform/install/index.rst b/docs/zh/app_ecosystem/feat_insight/install/index.rst similarity index 92% rename from docs/zh/app_ecosystem/feature_platform/install/index.rst rename to docs/zh/app_ecosystem/feat_insight/install/index.rst index b96f6c7df17..9c23f1f778d 100644 --- a/docs/zh/app_ecosystem/feature_platform/install/index.rst +++ b/docs/zh/app_ecosystem/feat_insight/install/index.rst @@ -5,8 +5,8 @@ .. 
toctree:: :maxdepth: 1 - package docker + package source config_file upgrade diff --git a/docs/zh/app_ecosystem/feature_platform/install/package.md b/docs/zh/app_ecosystem/feat_insight/install/package.md similarity index 54% rename from docs/zh/app_ecosystem/feature_platform/install/package.md rename to docs/zh/app_ecosystem/feat_insight/install/package.md index 9ded57515b3..73b9188104d 100644 --- a/docs/zh/app_ecosystem/feature_platform/install/package.md +++ b/docs/zh/app_ecosystem/feat_insight/install/package.md @@ -2,7 +2,7 @@ ## 介绍 -使用官方预编译的安装包,只需要本地有 Java 环境就可以快速部署 OpenMLDB 特征平台。 +使用官方预编译的安装包,只需要本地有 Java 环境就可以快速部署 FeatInsight 服务。 注意,需参考 [OpenMLDB 部署文档](../../../deploy/index.rst) 提前部署 OpenMLDB 集群。 @@ -11,12 +11,12 @@ 下载 Jar 文件。 ``` -wget https://openmldb.ai/download/feature-platform/openmldb-feature-platform-0.1-SNAPSHOT.jar +wget https://openmldb.ai/download/featinsight/featinsight-0.1.0-SNAPSHOT.jar ``` ## 配置 -参考[特征平台配置文件](./config_file.md),创建 `application.yml` 配置文件。 +参考 [FeatInsight 配置文件](./config_file.md),创建 `application.yml` 配置文件。 ``` server: @@ -30,9 +30,9 @@ openmldb: ## 启动 -启动特征平台服务。 +启动 FeatInsight 服务。 ``` -java -jar ./openmldb-feature-platform-0.1-SNAPSHOT.jar +java -jar ./featinsight-0.1.0-SNAPSHOT.jar ``` diff --git a/docs/zh/app_ecosystem/feature_platform/install/source.md b/docs/zh/app_ecosystem/feat_insight/install/source.md similarity index 71% rename from docs/zh/app_ecosystem/feature_platform/install/source.md rename to docs/zh/app_ecosystem/feat_insight/install/source.md index 5195086e318..875d843817d 100644 --- a/docs/zh/app_ecosystem/feature_platform/install/source.md +++ b/docs/zh/app_ecosystem/feat_insight/install/source.md @@ -2,14 +2,14 @@ ## 介绍 -通过源码编译 OpenMLDB 特征平台,可以按需使用特定源码功能,本文档提供源码编译的完整流程。 +通过源码编译 FeatInsight 项目,可以按需使用特定源码功能,本文档提供源码编译的完整流程。 ## 下载源码 下载项目源码。 ``` -git clone https://github.com/4paradigm/feature-platform +git clone https://github.com/4paradigm/FeatInsight ``` ## 编译源码 @@ -17,7 +17,7 @@ git clone https://github.com/4paradigm/feature-platform 进入项目根目录,执行以下命令编译前端和后端代码。 ``` -cd ./feature-platform/frontend/ +cd ./FeatInsight/frontend/ npm run build cd ../ diff --git a/docs/zh/app_ecosystem/feature_platform/install/upgrade.md b/docs/zh/app_ecosystem/feat_insight/install/upgrade.md similarity index 57% rename from docs/zh/app_ecosystem/feature_platform/install/upgrade.md rename to docs/zh/app_ecosystem/feat_insight/install/upgrade.md index d93951778bf..1155258a3ec 100644 --- a/docs/zh/app_ecosystem/feature_platform/install/upgrade.md +++ b/docs/zh/app_ecosystem/feat_insight/install/upgrade.md @@ -2,7 +2,7 @@ ## 介绍 -OpenMLDB 特征平台对外提供 HTTP 接口,底层依赖 OpenMLDB 数据库存储元数据,因此可以通过多实例和 Rolling update 等方法进行版本升级。 +FeatInsight 对外提供 HTTP 接口,底层依赖 OpenMLDB 数据库存储元数据,因此可以通过多实例和 Rolling update 等方法进行版本升级。 ## 单实例升级步骤 diff --git a/docs/zh/app_ecosystem/feature_platform/introduction.md b/docs/zh/app_ecosystem/feat_insight/introduction.md similarity index 68% rename from docs/zh/app_ecosystem/feature_platform/introduction.md rename to docs/zh/app_ecosystem/feat_insight/introduction.md index ab779006e76..49f477d6f41 100644 --- a/docs/zh/app_ecosystem/feature_platform/introduction.md +++ b/docs/zh/app_ecosystem/feat_insight/introduction.md @@ -1,24 +1,25 @@ # 简介 -OpenMLDB 特征平台是一个先进的特征存储(Feature Store)服务,基于 [OpenMLDB](https://github.com/4paradigm/OpenMLDB) 数据库实现高效的特征管理和编排功能。 +FeatInsight 是一个先进的特征存储(Feature Store)服务,基于 [OpenMLDB](https://github.com/4paradigm/OpenMLDB) 数据库实现高效的特征管理和编排功能。 -特征平台提供简便易用的 UI 界面,用户可以进行机器学习特征开发的全流程,包括数据的导入、查看、编辑,特征的生成、存储、上线等功能。 
针对离线场景中,用户可以选择特征生成离线样本用于后续的机器学习开发;针对在线场景中,用户可以选择特征创建特征服务,实现实时特征计算。 +FeatInsight 提供简便易用的 UI 界面,用户可以进行机器学习特征开发的全流程,包括数据的导入、查看、编辑,特征的生成、存储、上线等功能。 针对离线场景中,用户可以选择特征生成离线样本用于后续的机器学习开发;针对在线场景中,用户可以选择特征创建特征服务,实现实时特征计算。 ![](./images/bigscreen.png) ## 主要功能 -OpenMLDB 特征平台包括以下几个主要功能: +FeatInsight 包括以下几个主要功能: - [数据管理](./functions/import_data.md):用于导入和管理特征工程需要的原始数据和在线数据。 - [特征管理](./functions/manage_feature.md):用于存储原始特征数据和派生特征数据的存储系统。 - [在线场景](./functions/online_scenario.md):上线特征服务,使用在线数据提供硬实时的在线特征抽取接口。 - [离线场景](./functions/offline_scenario.md):对离线数据进行特征计算并导出样本文件,提供离线样本、任务管理功能。 - +- [SQL实验室](./functions/sql_playground.md):可调试和执行任意的 OpenMLDB SQL 语句,使用在线模式或离线模式完成特征计算任务。 +- [预计算特征](./functions/computed_features.md):用户可以通过预计算把特征值直接存入 OpenMLDB 在线表中,然后访问在线表数据进行读写特征。 ## 核心特性 -OpenMLDB 特征平台的主要目的是解决在机器学习项目中常见的问题,包括简便快捷地进行特征提取、转换、组合、选择以及血缘管理,特征的重用和共享,特征服务版本控制,以及确保在训练和推理过程中使用的特征数据的一致和可靠。一些特征平台的范例应用场景包括: +FeatInsight 的主要目的是解决在机器学习项目中常见的问题,包括简便快捷地进行特征提取、转换、组合、选择以及血缘管理,特征的重用和共享,特征服务版本控制,以及确保在训练和推理过程中使用的特征数据的一致和可靠。一些 FeatInsight 的范例应用场景包括: * 上线在线特征服务:提供本地化部署的高性能特征存储和在线特征计算功能。 * 搭建 MLOps 平台:基于 OpenMLDB 在线离线一致性快速实现完整的 MLOps 工作流。 @@ -28,7 +29,7 @@ OpenMLDB 特征平台的主要目的是解决在机器学习项目中常见的 ## 核心概念 -以下是特征平台所使用到的一些术语及其定义,以方便理解: +以下是 FeatInsight 所使用到的一些术语及其定义,以方便理解: * 特征:通过对原始数据进行特征抽取得到的可直接用于模型训练和推理的数据。 * 预计算特征:通过外部批计算或流式处理后存储的特征值,可直接上线使用。 diff --git a/docs/zh/app_ecosystem/feature_platform/quickstart.md b/docs/zh/app_ecosystem/feat_insight/quickstart.md similarity index 78% rename from docs/zh/app_ecosystem/feature_platform/quickstart.md rename to docs/zh/app_ecosystem/feat_insight/quickstart.md index 31f9167d488..88a255b198c 100644 --- a/docs/zh/app_ecosystem/feature_platform/quickstart.md +++ b/docs/zh/app_ecosystem/feat_insight/quickstart.md @@ -1,15 +1,15 @@ # 快速入门 -本文将介绍如何快速入门 OpenMLDB 特征平台,基于一个 SQL 示例来演示如何使用特征平台。 +本文将介绍如何快速入门 FeatInsight,基于一个 SQL 示例来演示如何使用。 -安装部署可参考 [OpenMLDB 部署文档](../../../deploy/index.rst) 和 [OpenMLDB 特征平台部署文档](./install/index.rst)。 +安装部署可参考 [OpenMLDB 部署文档](../../../deploy/index.rst) 和 [FeatInsight 部署文档](./install/index.rst)。 ## 使用流程 -特征平台的基本使用流程包括以下几个步骤: +FeatInsight 的基本使用流程包括以下几个步骤: 1. 导入数据:使用SQL命令或前端表单进行创建数据库、创建数据表、导入在线数据和导入离线数据等操作。 -2. 创建特征:使用SQL语句来定义特征视图,特征平台使用SQL编译器进行特征分析并创建对应的特征。 +2. 创建特征:使用SQL语句来定义特征视图,FeatInsight 使用SQL编译器进行特征分析并创建对应的特征。 3. 离线场景:选择想要导入的特征,可以同时选择不同特征视图的特征,并使用分布式计算把样本文件导入到本地或分布式存储。 4.
在线场景:选择想要上线的特征,一键发布成在线特征抽取服务,然后可使用HTTP客户端进行请求和返回在线特征抽取结果。 @@ -23,11 +23,11 @@ CREATE DATABASE test_db; CREATE TABLE test_db.test_table (id STRING, trx_time DATE); ``` -也可以在特征平台的“数据导入”前端页面直接创建。 +也可以在 FeatInsight 的“数据导入”前端页面直接创建。 ![](./images/create_test_table.png) -为了测试方便,我们准备一个 CSV 文件并保存到 `/tmp/test_table.csv`。注意,这里本地是 OpenMLDB TaskManager 服务器的本地路径,一般也是 OpenMLDB 特征平台的服务器路径,需要提前登陆编辑。 +为了测试方便,我们准备一个 CSV 文件并保存到 `/tmp/test_table.csv`。注意,这里本地是 OpenMLDB TaskManager 服务器的本地路径,一般也是 FeatInsight 的服务器路径,需要提前登陆编辑。 ``` id,trx_time @@ -86,7 +86,7 @@ SELECT id, dayofweek(trx_time) as trx_day FROM test_table ![](./images/test_offline_sample_detail.png) -在本地即可查看导出的样本文件内容。为了验证 OpenMLDB 特征平台提供的在线离线一致性,可记录离线特征结果,并于后面的在线特征计算做比较。 +在本地即可查看导出的样本文件内容。为了验证 FeatInsight 提供的在线离线一致性,可记录离线特征结果,并于后面的在线特征计算做比较。 ![](./images/local_test_offline_samples.png) @@ -106,11 +106,11 @@ SELECT id, dayofweek(trx_time) as trx_day FROM test_table ## 总结 -本示例演示了使用 OpenMLDB 特征平台的完整流程,通过编写简单的 SQL 即可实现在线和离线的特征定义,通过选择不同的特征,甚至是组合不同特征组的特征,即可实现快速的特征复用和上线,并且对比离线和在线的计算结果验证了特征计算的一致性。 +本示例演示了使用 FeatInsight 的完整流程,通过编写简单的 SQL 即可实现在线和离线的特征定义,通过选择不同的特征,甚至是组合不同特征组的特征,即可实现快速的特征复用和上线,并且对比离线和在线的计算结果验证了特征计算的一致性。 ## 附录:高级功能 -除了特征工程的基本功能之外,特征平台还提供了高级功能以方便用户进行特征工程的开发: +除了特征工程的基本功能之外,FeatInsight 还提供了高级功能以方便用户进行特征工程的开发: * SQL 实验室:提供了 OpenMLDB SQL 语句的调试和执行功能,方便用户执行任意 SQL 操作并调试特征抽取的 SQL 语句。详情请见[这里](./functions/sql_playground)。 * 预计算特征:可以将通过外部批计算或流式处理后得到的特征值直接存入OpenMLDB在线表中,然后访问在线表数据进行读写特征。详情请见[这里](./functions/computed_features)。 \ No newline at end of file diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_feature.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_feature.png new file mode 100644 index 00000000000..ad6b53b0ab3 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_feature.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_feature_service.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_feature_service.png new file mode 100644 index 00000000000..83a6d260ddd Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_feature_service.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_tables.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_tables.png new file mode 100644 index 00000000000..156ad25b474 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_tables.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_feature_view_detail.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_feature_view_detail.png new file mode 100644 index 00000000000..2cdd1bcb4a2 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_feature_view_detail.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_request_feature_service.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_request_feature_service.png new file mode 100644 index 00000000000..e731ee79a5a Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_request_feature_service.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_feature.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_feature.png new file mode 100644 index 00000000000..a01520f8464 Binary files /dev/null and 
b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_feature.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_feature_service.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_feature_service.png new file mode 100644 index 00000000000..8296a3907e1 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_feature_service.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_table.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_table.png new file mode 100644 index 00000000000..86a098c491f Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_table.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_export_offline_samples.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_export_offline_samples.png new file mode 100644 index 00000000000..13ae5329408 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_export_offline_samples.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_feature_service_detail.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_feature_service_detail.png new file mode 100644 index 00000000000..79426e410b2 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_feature_service_detail.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_features.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_features.png new file mode 100644 index 00000000000..aeb439453c2 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_features.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_import_offline_data.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_import_offline_data.png new file mode 100644 index 00000000000..0b69b9cb00a Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_import_offline_data.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_import_online_data.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_import_online_data.png new file mode 100644 index 00000000000..9d8fb565873 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_import_online_data.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_offline_samples_data.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_offline_samples_data.png new file mode 100644 index 00000000000..a02f861f0ec Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_offline_samples_data.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_preview_online_table.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_preview_online_table.png new file mode 100644 index 00000000000..ed086f6fb9b Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_preview_online_table.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_request_feature_service.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_request_feature_service.png new file mode 100644 index 00000000000..0a72b79170b Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_request_feature_service.png differ diff --git 
a/docs/zh/app_ecosystem/feat_insight/use_cases/index.rst b/docs/zh/app_ecosystem/feat_insight/use_cases/index.rst new file mode 100644 index 00000000000..cae20e85faa --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/use_cases/index.rst @@ -0,0 +1,9 @@ +============================= +应用案例 +============================= + +.. toctree:: + :maxdepth: 1 + + taxi_tour_duration_prediction + recommend_system \ No newline at end of file diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/recommend_system.md b/docs/zh/app_ecosystem/feat_insight/use_cases/recommend_system.md new file mode 100644 index 00000000000..1d071e5c34e --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/use_cases/recommend_system.md @@ -0,0 +1,113 @@ +# 电商推荐系统物料统计场景 + +## 场景介绍 + +在常见的电商推荐系统中,需对每次推荐请求前的特定时间段内(近7天),用户对各类标签广告的浏览次数进行精确统计,这些统计数据将被反馈给推荐系统,以便进行更深入的规则分析和判断。 + +## 场景数据 + +这里准备3张数据表,首先是请求数据表,用户通过 ID 以及请求时间查询当前窗口所需的特征。 + +``` +CREATE TABLE recommend_system.request (uid string, event_time timestamp) +``` + +然后是曝光表,需要提供用户 ID 以及物料 ID 信息,为了简化把其他无关的列都去掉。 + +``` +CREATE TABLE recommend_system.feeds (uid string, material_id string, event_time timestamp) +``` + +最后是物料表,主要包含物料基本信息,包括本场景需要统计的物料类型等,同样简化把无关的字段先去掉。 + +``` +CREATE TABLE recommend_system.material (material_id string, tag string); +``` + +## 特征设计 + +根据场景的背景描述,只需要提取用户 ID 以及物料的不同标签出现的次数即可,使用以下的 OpenMLDB SQL 进行特征抽取。 + +``` +SELECT + uid, + count_cate(material_id, tag) OVER w AS category_count +FROM + (SELECT uid, CAST (null AS string) AS material_id, CAST (null AS string) AS tag, event_time FROM request) +WINDOW + w AS ( + UNION ( + SELECT + uid, feeds.material_id, material.tag AS tag, event_time + FROM feeds + LAST JOIN material ON feeds.material_id = material.material_id) + PARTITION BY uid ORDER BY event_time ROWS_RANGE BETWEEN 7d PRECEDING AND CURRENT ROW) +``` + +可以参考下面的逻辑来理解 SQL 语句的含义: + +1. 将曝光表与物料表进行 Join 操作,这样拼接后的表就可以获得物料的标签类型等需要的属性。 +2. 对请求表进行拓展,增加 material_id 和 tag 列并使用 null 值填充,这样方便后续与第一步的输出表进行 Union 操作。 +3. 使用 Window Union 将第一步和第二步的表进行 Union 操作,这样就得到了一个完整的表,然后基于这个完整表进行窗口操作和查询操作。注意,这里使用 Window Union 而不是 Join + Window 是为了避免 Left Join 可能一行数据产生多行样本,而使用 Last Join 则可能导致副表只能拼接一行数据。 +4. 最后使用 count_cate 函数对物料标签进行计数,得到特征。 + +## 实现流程 + +### 1. 数据导入 + +首先创建数据库和数据表,为了方便上线这里把索引也提前加上了。 + +``` +CREATE DATABASE recommend_system; + +CREATE TABLE recommend_system.request (uid string, event_time timestamp, INDEX(key=uid, TS=event_time)); + +CREATE TABLE recommend_system.feeds (uid string, material_id string, event_time timestamp, INDEX(key=uid, TS=event_time)); + +CREATE TABLE recommend_system.material (material_id string, tag string); +``` + +因为实际数据需要脱敏,用户可以根据实际情况进行测试数据的导入,本文只演示特征上线流程。 + +### 2. 定义特征 + +使用前面介绍的 SQL 语句定义特征。 + +``` +SELECT + uid, + count_cate(material_id, tag) OVER w AS category_count +FROM + (SELECT uid, CAST (null AS string) AS material_id, CAST (null AS string) AS tag, event_time FROM request) +WINDOW + w AS ( + UNION ( + SELECT + uid, feeds.material_id, material.tag AS tag, event_time + FROM feeds + LAST JOIN material ON feeds.material_id = material.material_id) + PARTITION BY uid ORDER BY event_time ROWS_RANGE BETWEEN 7d PRECEDING AND CURRENT ROW) +``` + +在前端页面创建特征,并自动分析出需要创建的两个特征。 + +![](./images/recommend_create_feature.png) + +创建成功后可以通过特征视图查看详情。 + +![](./images/recommend_feature_view_detail.png) + +### 3. 
特征上线 + +在在线场景页面,选择需要上线的特征,并确认创建。 + +![](./images/recommend_create_feature_service.png) + +特征服务上线成功后,就可以通过输入请求数据进行在线请求测试了。 + +![](./images/recommend_request_feature_service.png) + +## 总结 + +对于推荐系统模型来说,特征工程是非常重要的一环,FeatInsight 提供了一个简单快速的特征管理和特征上线流程,帮助用户快速上线特征,提升推荐系统的效果,对于更复杂的特征也都可以使用 SQL 来描述和上线。 + diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/taxi_tour_duration_prediction.md b/docs/zh/app_ecosystem/feat_insight/use_cases/taxi_tour_duration_prediction.md new file mode 100644 index 00000000000..8a948c368ed --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/use_cases/taxi_tour_duration_prediction.md @@ -0,0 +1,104 @@ +# 出租车行程时间预测场景 + +## 场景介绍 + +场景来自 Kaggle 的 [New York City Taxi Trip Duration](https://www.kaggle.com/c/nyc-taxi-trip-duration/overview), 对纽约市出租车公司的行程时间进行预测,预测的输入为出发地经纬度、目的地经纬度、出发时间、天气情况等,需要抽取特征最终预测出行程时间。 + +## 特征设计 + +特征设计参考 [出租车行程时间预测 (OpenMLDB + LightGBM)](../../../use_case/taxi_tour_duration_prediction.md),使用下面的 OpenMLDB SQL 进行特征工程和数据导出。 + +``` +SELECT + trip_duration, + passenger_count, + sum(pickup_latitude) OVER w AS vendor_sum_pl, + max(pickup_latitude) OVER w AS vendor_max_pl, + min(pickup_latitude) OVER w AS vendor_min_pl, + avg(pickup_latitude) OVER w AS vendor_avg_pl, + sum(pickup_latitude) OVER w2 AS pc_sum_pl, + max(pickup_latitude) OVER w2 AS pc_max_pl, + min(pickup_latitude) OVER w2 AS pc_min_pl, + avg(pickup_latitude) OVER w2 AS pc_avg_pl, + count(vendor_id) OVER w2 AS pc_cnt, + count(vendor_id) OVER w AS vendor_cnt +FROM t1 +WINDOW + w AS (PARTITION BY vendor_id ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW), + w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW) +``` + +## 实现流程 + +### 1. 数据导入 + +创建测试数据库 `taxi_trip_duration` 和测试数据表 `t1`。 + +``` +CREATE DATABASE taxi_trip_duration; + +CREATE TABLE taxi_trip_duration.t1 (id string, vendor_id int, pickup_datetime timestamp, dropoff_datetime timestamp, passenger_count int, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, store_and_fwd_flag string, trip_duration int); +``` + +![](./images/taxi_create_table.png) + +注意,在 OpenMLDB 0.8.4及前序版本不支持自动创建索引,因此需要在创建表时添加索引。 + +``` +CREATE TABLE taxi_trip_duration.t1(id string, vendor_id int, pickup_datetime timestamp, dropoff_datetime timestamp, passenger_count int, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, store_and_fwd_flag string, trip_duration int, INDEX(KEY=vendor_id, TS=pickup_datetime), INDEX(KEY=passenger_count, TS=pickup_datetime)); +``` + +然后从 Kaggle 下载数据集进行导入数据,下载命令如下。 + +``` +kaggle competitions download -c nyc-taxi-trip-duration +``` + +下载后解压得到 `train.csv` 文件,放在 `/tmp/train.csv` 路径下,在前端页面选择“使用 CSV 导入”在线数据。 + +![](./images/taxi_import_online_data.png) + +导入成功后,可以预览在线表数据。 + +![](./images/taxi_preview_online_table.png) + +然后进行离线的数据导入,同样在前端页面选择“使用 CSV 导入”操作即可。 + +![](./images/taxi_import_offline_data.png) + +### 2. 创建特征 + +根据前面设计的 SQL 语句,我们选择创建一个特征组,创建时会”分析 SQL“,并且根据 SQL 自动分析出创建的特征列表。 + +![](./images/taxi_create_feature.png) + +![](./images/taxi_features.png) + +### 3. 离线场景 + +在离线场景,我们选择刚生成的特征视图的所有特征,把离线样本导出到本地进行模型训练。 + +![](./images/taxi_export_offline_samples.png) + +离线任务执行成功后,可以查看本地路径 `/tmp/taxi_tour_features/`,发现特征数据已经计算出来,并且导出到本地可以直接给模型训练使用。模型训练可参考[出租车行程时间预测 (OpenMLDB + LightGBM)](../../../use_case/taxi_tour_duration_prediction.md)。 + +![](./images/taxi_offline_samples_data.png) + +### 4. 
在线场景 + +通过离线场景验证特征 SQL 正确后,可通过在线场景把特征上线成特征服务。 + +![](./images/taxi_create_feature_service.png) + +创建成功后,可以查看特征服务的详情页面。 + +![](./images/taxi_feature_service_detail.png) + +最后可以在请求页面进行在线测试,并且验证在线离线特征结果是否一致。 + +![](./images/taxi_request_feature_service.png) + +## 总结 + +使用 FeatInsight 实现出租车行程时间预测场景,整个过程非常简单且步骤清晰,相比于使用 OpenMLDB 命令行工具更加直观,而且只需要有浏览器就可以操作,免去科学家搭建环境的麻烦,在线调试特征以及特征复用也更加简单。 + diff --git a/docs/zh/app_ecosystem/feature_platform/install/docker.md b/docs/zh/app_ecosystem/feature_platform/install/docker.md deleted file mode 100644 index b17a2630eef..00000000000 --- a/docs/zh/app_ecosystem/feature_platform/install/docker.md +++ /dev/null @@ -1,37 +0,0 @@ -# Docker - -## 介绍 - -使用构建好的 Docker 镜像, 可以快速启动 OpenMLDB 特征服务. - -## 配置 - -参考[特征平台配置文件](./config_file.md),创建 `application.yml` 配置文件。 - -``` -server: - port: 8888 - -openmldb: - zk_cluster: 127.0.0.1:2181 - zk_path: /openmldb - apiserver: 127.0.0.1:9080 -``` - -## Linux - -参考 [OpenMLDB 部署文档](../../../deploy/index.rst) 提前部署 OpenMLDB 集群。 - -启动 OpenMLDB 特征平台 容器. - -``` -docker run -d -p 8888:8888 --net=host -v `pwd`/application.yml:/app/application.yml registry.cn-shenzhen.aliyuncs.com/tobe43/openmldb-feature-platform -``` - -## MacOS - -由于 MacOS 通过虚拟机启动 Docker 容器,使用 `--net=host` 参数无法正常工作,需要提前修改配置文件指向正确的 OpenMLDB 服务。 - -``` -docker run -d -p 8888:8888 -v `pwd`/application.yml:/app/application.yml registry.cn-shenzhen.aliyuncs.com/tobe43/openmldb-feature-platform -``` diff --git a/docs/zh/deploy/install_deploy.md b/docs/zh/deploy/install_deploy.md index 9b7a67fa857..0f32ba24b00 100644 --- a/docs/zh/deploy/install_deploy.md +++ b/docs/zh/deploy/install_deploy.md @@ -155,6 +155,7 @@ OpenMLDB 提供了两种启动模式:普通和守护进程启动。守护进 如果想要使守护进程模式启动,请使用`bash bin/start.sh start mon`或者`sbin/start-all.sh mon`的方式启动。守护进程模式中,`bin/.pid`将是 mon 进程的 pid,`bin/.pid.child` 为组件真实的 pid。 ## 部署方式一:一键部署(推荐) + OpenMLDB集群版需要部署ZooKeeper、NameServer、TabletServer、TaskManager等模块。其中ZooKeeper用于服务发现和保存元数据信息。NameServer用于管理TabletServer,实现高可用和failover。TabletServer用于存储数据和主从同步数据。APIServer是可选的,如果要用http的方式和OpenMLDB交互需要部署此模块。TaskManager 用于管理离线 job。我们提供了一键部署脚本,可以简化手动在每台机器上下载和配置的复杂性。 **注意:** 同一台机器部署多个组件时,一定要部署在不同的目录里,便于单独管理。尤其是部署TabletServer,一定不能重复使用目录,避免数据文件和日志文件冲突。 @@ -164,9 +165,9 @@ DataCollector和SyncTool暂不支持一键部署。请参考手动部署方式 ### 环境要求 - 部署机器(执行部署脚本的机器)可以免密登录其他部署节点 -- 部署机器安装 `rsync` 工具 -- 部署机器安装 Python3 -- 部署Zookeeper和TaskManager的机器安装 JRE (Java Runtime Environment) +- 部署机器需安装 `rsync` 工具 +- 部署机器需安装 Python3 +- Zookeeper和TaskManager的运行机器上需安装 JRE (Java Runtime Environment) ### 下载OpenMLDB发行版 @@ -176,28 +177,55 @@ tar -zxvf openmldb-0.8.4-linux.tar.gz cd openmldb-0.8.4-linux ``` +### 脚本使用逻辑 + +部署脚本均在sbin中,我们也称一键部署为sbin部署。初次部署过程一般是“修改环境和配置文件 -> sbin/deploy-all.sh -> sbin/start-all.sh”。如果需要停止服务,执行`sbin/stop-all.sh`。清理已部署的数据和日志,执行`sbin/clear-all.sh`。Docker镜像中的`/work/init.sh`脚本便是进行“deploy-all -> stop-all -> clear-all -> start-all”。 + +如果集群正在运行,需要修改配置(不能只deploy到单台,但全部覆盖配置不影响进程运行)并重启某一个组件(不能指定单进程,但可以指定组件),需要“修改配置 -> deploy-all.sh -> stop-tablets.sh -> start-tablets.sh”。但需要注意重启tablet可能会导致数据加载失败(影响服务),需要进行集群诊断与恢复,可使用[一键inspect](../maintain/diagnose.md#一键inspect)。数据量较大或不可出现服务中断时,更推荐使用扩缩容方式或手动重启单进程。 + ### 环境配置 -环境变量定义在`conf/openmldb-env.sh`,如下表所示: - -| 环境变量 | 默认值 | 定义 | -|-----------------------------------|------------------------------------|-------------------------------------------------------------------------| -| OPENMLDB_VERSION | 0.8.4 | OpenMLDB版本 | -| OPENMLDB_MODE | standalone | standalone或者cluster | -| OPENMLDB_HOME | 当前发行版的根目录 | openmldb发行版根目录 | -| SPARK_HOME | $OPENMLDB_HOME/spark | openmldb 
spark发行版根目录,如果该目录不存在,自动从网上下载 | -| OPENMLDB_TABLET_PORT | 10921 | TabletServer默认端口 | -| OPENMLDB_NAMESERVER_PORT | 7527 | NameServer默认端口 | -| OPENMLDB_TASKMANAGER_PORT | 9902 | taskmanager默认端口 | -| OPENMLDB_APISERVER_PORT | 9080 | APIServer默认端口 | -| OPENMLDB_USE_EXISTING_ZK_CLUSTER | false | 是否使用已经部署的ZooKeeper集群。如果是`false`,会在部署脚本里自动启动ZooKeeper集群 | -| OPENMLDB_ZK_HOME | $OPENMLDB_HOME/zookeeper | ZooKeeper发行版根目录 | -| OPENMLDB_ZK_CLUSTER | 自动从`conf/hosts`中的`[zookeeper]`配置获取 | ZooKeeper集群地址 | -| OPENMLDB_ZK_ROOT_PATH | /openmldb | OpenMLDB在ZooKeeper的根目录 | -| OPENMLDB_ZK_CLUSTER_CLIENT_PORT | 2181 | ZooKeeper client port, 即zoo.cfg里面的clientPort | -| OPENMLDB_ZK_CLUSTER_PEER_PORT | 2888 | ZooKeeper peer port,即zoo.cfg里面这种配置server.1=zoo1:2888:3888中的第一个端口配置 | -| OPENMLDB_ZK_CLUSTER_ELECTION_PORT | 3888 | ZooKeeper election port, 即zoo.cfg里面这种配置server.1=zoo1:2888:3888中的第二个端口配置 | + +环境变量定义在`conf/openmldb-env.sh`,主要变量如下表所示: + +| 环境变量 | 默认值 | 定义 | +| -------------------------------- | ------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------- | +| OPENMLDB_VERSION | 0.8.4 | OpenMLDB版本,主要用于spark下载,一般不改动。 | +| OPENMLDB_MODE | cluster | standalone或者cluster | +| OPENMLDB_HOME | 当前发行版的根目录 | openmldb发行版根目录,不则使用当前根目录,也就是openmldb-0.8.4-linux所在目录。 | +| SPARK_HOME | $OPENMLDB_HOME/spark | openmldb spark发行版根目录,如果该目录不存在,自动从网上下载。**此路径也将成为TaskManager运行机器上的Spark安装目录。** | +| RUNNER_EXISTING_SPARK_HOME | | 配置此项,运行TaskManager的机器将使用该Spark环境,将不下载、部署OpenMLDB Spark发行版。 | +| OPENMLDB_USE_EXISTING_ZK_CLUSTER | false | 是否使用已经运行的ZooKeeper集群。如果是`true`,将跳过ZooKeeper集群的部署与管理。 | +| OPENMLDB_ZK_HOME | $OPENMLDB_HOME/zookeeper | ZooKeeper发行版根目录,如果该目录不存在,自动从网上下载。 | +| OPENMLDB_ZK_CLUSTER | | ZooKeeper集群地址,为空时自动从`conf/hosts`中的`[zookeeper]`配置获取。建议自建ZooKeeper集群时在hosts中创建,使用已有ZooKeeper集群时配置此项。 | +| OPENMLDB_ZK_ROOT_PATH | /openmldb | OpenMLDB在ZooKeeper集群的根目录 | +| OPENMLDB_FORCE_LOCAL | false | 如果为`true`,所有部署将认定为本地拷贝。单机部署集群,又需要使用公网IP时,开启此项,避免ssh | +| RUNNER_JAVA_HOME | | 运行ZooKeeper和TaskManager的机器ssh可能无Java相关环境变量,可使用此变量设置。不设置则不覆盖环境。 | +| CLEAR_OPENMLDB_INSTALL_DIR | false | sbin/clear-all.sh只清理运行产生的数据与日志,如果是`true`,将把运行机器上的整个安装目录删除。 | + +通常来讲,需要确认以下几点: +- ZooKeeper集群地址,如果使用已有ZooKeeper集群,需要配置`OPENMLDB_USE_EXISTING_ZK_CLUSTER=true`,并配置`OPENMLDB_ZK_CLUSTER`。(如果在`conf/hosts`中配置外部ZK集群,请注释标注其不受sbin部署影响,避免混乱。) +- 需要此工具部署ZooKeeper集群时,在`conf/hosts`中配置`[zookeeper]`。填写多个ZooKeeper节点,即部署ZooKeeper集群,无需额外配置。 +- Spark环境,如果需要使用运行机器上已有的Spark环境,需要配置`RUNNER_EXISTING_SPARK_HOME`(地址为TaskManager运行机器上的路径)。如果部署机器存在Spark环境,并想要在TaskManager机器上使用此套环境,可配置`SPARK_HOME`(部署到TaskManager机器同名路径上)。`SPARK_HOME`不进行配置时,将自动下载、使用OpenMLDB Spark发行版。 + +#### 默认端口 +| 环境变量 | 默认值 | 定义 | +| ------------------------- | ------ | -------------------- | +| OPENMLDB_TABLET_PORT | 10921 | TabletServer默认端口 | +| OPENMLDB_NAMESERVER_PORT | 7527 | NameServer默认端口 | +| OPENMLDB_TASKMANAGER_PORT | 9902 | TaskManager默认端口 | +| OPENMLDB_APISERVER_PORT | 9080 | APIServer默认端口 | + +默认端口只会在节点配置不显式配置端口号时才会被使用,更推荐**直接在节点配置文件hosts中配置好端口号**。 + +#### ZooKeeper高级配置 +| 环境变量 | 默认值 | 定义 | +| --------------------------------- | ------ | --------------------------------------------------------------------------------------- | +| OPENMLDB_ZK_CLUSTER_CLIENT_PORT | 2181 | ZooKeeper client port, 即zoo.cfg里面的clientPort | +| OPENMLDB_ZK_CLUSTER_PEER_PORT | 2888 | ZooKeeper peer port,即zoo.cfg里面这种配置server.1=zoo1:2888:3888中的第一个端口配置 | +| OPENMLDB_ZK_CLUSTER_ELECTION_PORT | 3888 | ZooKeeper election port, 
即zoo.cfg里面这种配置server.1=zoo1:2888:3888中的第二个端口配置 | ### 节点配置 + 节点配置文件为`conf/hosts`,示例如下: ```bash [tablet] @@ -229,14 +257,16 @@ node3:2181:2888:3888 /tmp/openmldb/zk-1 对于`[zookeeper]`, 会有额外端口参数,包括follower用来连接leader的`zk_peer_port`和用于leader选择的`zk_election_port`, 其格式为`host:port:zk_peer_port:zk_election_port WORKDIR`。 -每一行节点列表,除了`host`是必须的,其他均为可选,如果没有提供,会使用默认配置,默认配置参考`conf/openmldb-env.sh`。 +每一行节点列表,除了`host`是必须的,其他均为可选,如果没有提供,会使用默认配置,默认配置参考`conf/openmldb-env.sh`。无`WORKDIR`配置的节点,所有OpenMLDB Server的默认运行目录为`OPENMLDB_HOME`,ZooKeeper默认目录为`OPENMLDB_ZK_HOME`。 + +host配置为localhost或127.0.0.1时,将自动识别为部署到本地,不会进行ssh和rsync。当集群在本地部署且需要对外暴露服务,hosts中节点需配置为外网IP,如果不想配置本机ssh免密,可在`conf/openmldb-env.sh`中配置`OPENMLDB_FORCE_LOCAL=true`。 ```{warning} 如果在不同机器上部署多个 TaskManager,其 `offline.data.prefix` 配置的路径,这些机器必须可以访问,建议配置hdfs路径。 ``` ### 修改机器环境配置 (可选) -``` +```bash bash sbin/init_env.sh ``` 说明: @@ -248,15 +278,40 @@ bash sbin/init_env.sh ```bash sbin/deploy-all.sh ``` -该脚本会把相关的文件分发到`conf/hosts`里面配置的机器上,同时根据`conf/hosts`和`conf/openmldb-env.sh` -的配置,对相关组件的配置做出相应的更新。 +该脚本会把相关的文件分发到`conf/hosts`里面配置的机器上,同时根据`conf/hosts`和`conf/openmldb-env.sh`的配置,对相关组件的配置做出相应的更新。 -如果希望为每个节点添加一些额外的相同的定制化配置,可以在执行deploy脚本之前,修改`conf/xx.template`的配置, -这样在分发配置文件的时候,每个节点都可以用到更改后的配置。 -重复执行`sbin/deploy-all.sh`会覆盖上一次的配置。 +如果希望为每个节点添加一些额外的相同的定制化配置,可以在执行deploy脚本之前,**修改`conf/xx.template`的配置**。只有和openmldb-env.sh中相关的配置会被部署工具自动追加到配置尾部,其他配置不会被覆盖,可放心修改。执行deploy,将配置文件分发到运行节点中,重复执行`sbin/deploy-all.sh`会覆盖上一次的配置。 详细配置说明见[配置文件](./conf.md),请注意TaskManager Spark的选择与细节配置[Spark Config详解](./conf.md#spark-config详解)。 +执行阶段日志类似下文,请注意部署到的host与目录: +``` +deploy tablet to localhost:10921 /tmp/openmldb/tablet-1 +copy /work/openmldb to localhost:/tmp/openmldb/tablet-1 +deploy tablet to localhost:10922 /tmp/openmldb/tablet-2 +copy /work/openmldb to localhost:/tmp/openmldb/tablet-2 +deploy nameserver to localhost:7527 /work/openmldb +skip rsync as dest=src: /work/openmldb +deploy apiserver to localhost:9080 /work/openmldb +skip rsync as dest=src: /work/openmldb +/work/openmldb/spark already exists. Skip deploy spark locally +deploy taskmanager to localhost:9902 /work/openmldb +skip rsync as dest=src: /work/openmldb +/work/openmldb/zookeeper already exists. Skip download zookeeper. 
+deploy zookeeper to localhost:2181 /tmp/openmldb/zk-1 +copy /work/openmldb/zookeeper to localhost:/tmp/openmldb/zk-1 +``` + +对环境变量有疑问,注意日志`OPENMLDB envs:`的打印结果。 + +- 配置 +deploy不支持对单个组件的配置更新,更改单个组件也需要使用`deploy-all.sh`。如果你在部署host上单独修改,需要修改`xx.flags`/`taskmanager.properties`而不是template配置,而且`deploy-all.sh`将对该配置进行覆盖,请谨慎配置。检查配置时以host的运行目录中的`xx.flags`/`taskmanager.properties`为准。 + +- 日志 +相应的,各个节点的日志也在各自的运行目录中,具体位置参考[部署方式二:手动部署](#部署方式二手动部署)中各个组件的日志位置说明。 + +收集日志与配置,可以使用诊断工具[检查内容](../maintain/diagnose.md#检查内容),默认将各个节点的配置和日志都收集到`/tmp/diag_collect`目录中,可以统一查看。 + ### 启动服务 普通模式启动: @@ -270,17 +325,26 @@ sbin/start-all.sh sbin/start-all.sh mon ``` -该脚本会把 `conf/hosts` 里面配置的所有服务启动起来。启动完成以后,可以通过辅助脚本启动 CLI (`sbin/openmldb-cli.sh`),来验证集群是否正常启动。 +该脚本会把 `conf/hosts` 里面配置的所有服务启动起来。启动完成以后,可以通过辅助脚本启动 CLI (`sbin/openmldb-cli.sh`),来验证集群是否正常启动。对环境变量有疑问,注意日志`OPENMLDB envs:`的打印结果。 ```{tip} start-all.sh 是一个非常有用的工具。除了在部署阶段可以使用,也可以在运维阶段用于启动某一个下线的 OpenMLDB 进程。比如某一个 tablet 进程意外下线,你可以直接执行 start-all.sh。该脚本对于已经启动的进程不会产生副作用,对于已配置、但是未启动的进程,将会自动进行启动。 ``` ### 停止服务 + 如果需要停止所有服务,可以执行以下脚本: ```bash sbin/stop-all.sh ``` +### 清理数据和日志 + +如果需要清理所有服务的数据和日志,可以执行以下脚本: +```bash +sbin/clean-all.sh +``` + +如果需要保留集群数据,请不要执行该脚本。 ## 部署方式二:手动部署 OpenMLDB集群版需要部署ZooKeeper、NameServer、TabletServer、TaskManager等模块。其中ZooKeeper用于服务发现和保存元数据信息。NameServer用于管理TabletServer,实现高可用和failover。TabletServer用于存储数据和主从同步数据。APIServer是可选的,如果要用http的方式和OpenMLDB交互需要部署此模块。TaskManager用于管理离线job。 @@ -530,6 +594,8 @@ cp conf/apiserver.flags.template conf/apiserver.flags **注意:** * 如果http请求并发度较大,可自行调大APIServer的线程数,`--thread_pool_size`,默认为16,重启生效。 +* 可以通过`--user`和`--password`指定连接服务端的用户名和密码 +* 默认会用root用户空密码去连接服务端,如果修改了root密码,需要用`--password`指定新密码 **3. 启动服务** @@ -591,6 +657,7 @@ cp conf/taskmanager.properties.template conf/taskmanager.properties * 修改`offline.data.prefix`为离线表存储路径,如果使用Yarn模式需要修改为对应HDFS路径。 * 修改`spark.master`为离线任务运行模式,目前支持local和yarn模式。 * 修改`spark.home`为Spark环境路径,如果不配置或配置为空则使用`SPARK_HOME`环境变量的配置。也可在配置文件中设置,路径为绝对路径。 +* 可以通过`user`和`password`指定连接server端用户名和密码。默认会用root用户空密码去连接服务端,如果修改了root密码,需要指定新密码. ``` server.host=172.27.128.33 diff --git a/docs/zh/faq/client_faq.md b/docs/zh/faq/client_faq.md index 3b3bff7e93c..a249f7875d5 100644 --- a/docs/zh/faq/client_faq.md +++ b/docs/zh/faq/client_faq.md @@ -3,12 +3,12 @@ ## fail to get tablet ... 的错误日志 优先检查集群中tablet server是否意外下线,或者在线表是否不可读写。推荐通过[openmldb_tool](../maintain/diagnose.md)诊断,使用`status`(status --diff)和`inspect online`两个检查命令。 -TODO diag tool 测到offline或online表不正常,会输出警告和下一步应该怎么操作? +TODO diag tool 测到offline或online表不正常,会输出警告和下一步应该怎么操作。 如果只能手动检查,需要两步: - `show components`,检查server是否存在在列表中(TaskManager如果下线,将不在表中。Tablet如果下线,将在表中,但状态为offline),以及在列表中的server的状态是否为online。如果存在offline的server,**先将server重启加入集群**。 - `show table status like '%'`(低版本如果不支持like,需要分别查询系统db和用户db),检查每个表的"Warnings"是否报错。 -一般会得到`real replica number X does not match the configured replicanum X`等错误,具体错误信息请参考[SHOW TABLE STATUS](../openmldb_sql/ddl/SHOW_TABLE_STATUS.md)。这些错误都说明表目前是有问题的,无法提供正常读写功能,通常是由于Tablet +一般会得到`real replica number X does not match the configured replicanum X`等错误,具体错误信息请参考[SHOW TABLE STATUS](../openmldb_sql/ddl/SHOW_TABLE_STATUS.md)。这些错误都说明表目前是有问题的,无法提供正常读写功能,通常是由于Tablet。 ## 为什么收到 Reached timeout 的警告日志? ``` @@ -25,7 +25,7 @@ rpc_client.h:xxx] request error. [E1008] Reached timeout=xxxms ``` rpc_client.h:xxx] request error. 
[E1014]Got EOF of Socket{id=x fd=x addr=xxx} (xx) ``` -这是因为`addr`端主动断开了连接,`addr`的地址大概率是TaskManager。这不代表TaskManager不正常,而是TaskManager端认为这个连接没有活动,超过keepAliveTime了,而主动断开通信channel。 +这是因为`addr`端主动断开了连接,`addr`的地址大概率是TaskManager。这不代表TaskManager不正常,而是TaskManager端认为这个连接没有活动,超过`keepAliveTime`了,而主动断开通信channel。 在0.5.0及以后的版本中,可以调大TaskManager的`server.channel_keep_alive_time`来提高对不活跃channel的容忍度。默认值为1800s(0.5h),特别是使用同步的离线命令时,这个值可能需要适当调大。 在0.5.0以前的版本中,无法更改此配置,请升级TaskManager版本。 @@ -70,7 +70,7 @@ sdk日志(glog日志): ## 插入错误,日志显示`please use getInsertRow with ... first` -在JAVA client使用InsertPreparedStatement进行插入,或在Python中使用sql和parameter进行插入时,client底层实际有cache影响,第一步`getInsertRow`生成sql cache并返回sql还需要补充的parameter信息,第二步才会真正执行insert,而执行insert需要使用第一步缓存的sql cache。所以,当多线程使用同一个client时,可能因为插入和查询频繁更新cache表,将你想要执行的insert sql cache淘汰掉了,所以会出现好像第一步`getInsertRow`并未执行的样子。 +在JAVA client使用`InsertPreparedStatement`进行插入,或在Python中使用sql和parameter进行插入时,client底层实际有cache影响,第一步`getInsertRow`生成sql cache并返回sql还需要补充的parameter信息,第二步才会真正执行insert,而执行insert需要使用第一步缓存的sql cache。所以,当多线程使用同一个client时,可能因为插入和查询频繁更新cache表,将你想要执行的insert sql cache淘汰掉了,所以会出现好像第一步`getInsertRow`并未执行的样子。 目前可以通过调大`maxSqlCacheSize`这一配置项来避免错误。仅JAVA/Python SDK支持配置。 diff --git a/docs/zh/faq/server_faq.md b/docs/zh/faq/server_faq.md index 1b89fd383d6..a0297df6995 100644 --- a/docs/zh/faq/server_faq.md +++ b/docs/zh/faq/server_faq.md @@ -1,6 +1,6 @@ # Server FAQ -Server中有任何上下线变化或问题,都先openmldb_tool status + inspect online检查下集群是否正常。 +Server中有任何上下线变化或问题,都先`openmldb_tool status` 和 `inspect online`检查下集群是否正常。 ## 部署和启动 FAQ @@ -8,7 +8,7 @@ Server中有任何上下线变化或问题,都先openmldb_tool status + inspec 虽然有一键启动脚本,但由于配置繁多,可能出现“端口已被占用”,“目录无读写权限”等问题。这些问题都是server进程运行之后才能发现,退出后没有及时反馈。(如果配置了监控,可以通过监控直接检查。) 所以,请先确认集群的所有server进程都正常运行。 -可以通过`ps axu | grep openmldb`或sql命令`show components;`来查询。(注意,如果你使用了守护进程,openmldb server进程可能是在启动停止的循环中,并不代表持续运行,可以通过日志或`show components;`连接时间来确认。) +可以通过`ps aux | grep openmldb`或sql命令`show components;`来查询。(注意,如果你使用了守护进程,openmldb server进程可能是在启动停止的循环中,并不代表持续运行,可以通过日志或`show components;`连接时间来确认。) 如果进程都活着,集群还是表现不正常,需要查询一下server日志。可以优先看WARN和ERROR级日志,很大概率上,它们就是根本原因。 @@ -18,7 +18,7 @@ Server中有任何上下线变化或问题,都先openmldb_tool status + inspec - tablet异常退出 - 多副本表多个副本所在的tablets同时重启或者重启太快,造成某些`auto_failover`操作还没完成tablet就重启 -- auto_failover设成`false` +- `auto_failover`设成`false` 当服务启动成功后,可以通过`gettablestatus`获得所有表的状态: ``` @@ -39,7 +39,7 @@ http_rpc_protocol.cpp:911] Fail to write into Socket{id=xx fd=xx addr=xxx} (0x7a 这是server端会打印的日志。一般是client端使用了连接池或短连接模式,在RPC超时后会关闭连接,server写回response时发现连接已经关了就报这个错。Got EOF就是指之前已经收到了EOF(对端正常关闭了连接)。client端使用单连接模式server端一般不会报这个。 ### 2. 表数据的ttl初始设置不合适,如何调整? 
-这需要使用nsclient来修改,普通client无法做到。nsclient启动方式与命令,见[ns client](../maintain/cli.md#ns-client)。 +这需要使用nsclient来修改,普通client无法做到。nsclient启动方式与命令,见[NS Client](../maintain/cli.md#ns-client)。 在nsclient中使用命令`setttl`可以更改一个表的ttl,类似 ``` diff --git a/docs/zh/index.rst b/docs/zh/index.rst index 2a1e0e52346..cd827813914 100644 --- a/docs/zh/index.rst +++ b/docs/zh/index.rst @@ -23,5 +23,5 @@ OpenMLDB 文档 (|version|) :hidden: :caption: 📚 应用生态 - app_ecosystem/feature_platform/index + app_ecosystem/feat_insight/index app_ecosystem/sql_emulator/index diff --git a/docs/zh/integration/offline_data_sources/hive.md b/docs/zh/integration/offline_data_sources/hive.md index 286f14d9684..5e48115b9c8 100644 --- a/docs/zh/integration/offline_data_sources/hive.md +++ b/docs/zh/integration/offline_data_sources/hive.md @@ -26,14 +26,14 @@ ### 配置 -目前 OpenMLDB 只支持使用 metastore 服务来连接Hive。你可以在以下两种配置方式中选择一种,来访问 Hive 数据源。 +目前 OpenMLDB 只支持使用 metastore 服务来连接Hive。你可以在以下两种配置方式中选择一种,来访问 Hive 数据源。测试搭建的HIVE环境简单,通常只需要配置`hive.metastore.uris`即可。但生产环境中,可能需要配置更多的Hive配置,更推荐使用`hive-site.xml`的方式。 -- spark.conf:你可以在 spark conf 中配置 `spark.hadoop.hive.metastore.uris`。有两种方式: +- spark.conf:你可以在 spark conf 中配置 `spark.hadoop.hive.metastore.uris`等相关配置。有两种方式: - taskmanager.properties: 在配置项 `spark.default.conf` 中加入`spark.hadoop.hive.metastore.uris=thrift://...` ,随后重启taskmanager。 - CLI: 在 ini conf 中加入此配置项,并使用`--spark_conf`启动CLI,参考[客户端Spark配置文件](../../reference/client_config/client_spark_config.md)。 -- hive-site.xml:你可以配置 `hive-site.xml` 中的 `hive.metastore.uris`,并将配置文件放入 Spark home的`conf/`(如果已配置`HADOOP_CONF_DIR`环境变量,也可以将配置文件放入`HADOOP_CONF_DIR`中)。`hive-site.xml` 样例: +- hive-site.xml:你可以将HIVE的配置 `hive-site.xml` 放入 Spark home的`conf/`(如果已配置`HADOOP_CONF_DIR`环境变量,也可以将配置文件放入`HADOOP_CONF_DIR`中)。`hive-site.xml` 样例: ```xml @@ -122,7 +122,7 @@ LOAD DATA INFILE 'hive://db1.t1' INTO TABLE db1.t1 OPTIONS(deep_copy=true, sql=' 对于 Hive 数据源的导出是通过 API [`SELECT INTO`](../../openmldb_sql/dql/SELECT_INTO_STATEMENT.md) 进行支持,通过使用特定的 URI 接口 `hive://[db].table` 的格式进行导出到 Hive 数仓。注意: -- 如果不指定数据库名字,则会使用默认数据库名字 `default_db` +- 如果不指定Hive数据库名字,则会使用Hive默认数据库 `default` - 如果指定数据库名字,则该数据库必须已经存在,目前不支持对于不存在的数据库进行自动创建 - 如果指定的Hive表名不存在,则会在 Hive 内自动创建对应名字的表 - `OPTIONS` 参数只有导出模式`mode`生效,其他参数均不生效 diff --git a/docs/zh/integration/offline_data_sources/iceberg.md b/docs/zh/integration/offline_data_sources/iceberg.md new file mode 100644 index 00000000000..12088f5e990 --- /dev/null +++ b/docs/zh/integration/offline_data_sources/iceberg.md @@ -0,0 +1,132 @@ +# Iceberg + +## 简介 + +[Apache Iceberg](https://iceberg.apache.org/) 是一个开源的大数据表格格式。Iceberg可以在Spark、Trino、PrestoDB、Flink、Hive和Impala等计算引擎中添加表格,使用高性能的表格格式,就像SQL表格一样。OpenMLDB 支持使用 Iceberg 作为离线存储引擎,导入数据和导出特征计算数据。 + +## 配置 + +### 安装 + +[OpenMLDB Spark 发行版](../../tutorial/openmldbspark_distribution.md) v0.8.5 及以上版本均已经包含 Iceberg 1.4.3 依赖。如果你需要与其他iceberg版本或者其他Spark发行版一起使用,你可以从[Iceberg release](https://iceberg.apache.org/releases/)下载对应的Iceberg依赖,并将其添加到Spark的classpath/jars中。例如,如果你使用的是OpenMLDB Spark,你应该下载`x.x.x Spark 3.2_12 runtime Jar`(x.x.x is iceberg version)并将其添加到Spark home的`jars/`中。 + +### 配置 + +你需要将catalog配置添加到Spark配置中。有两种方式: + +- taskmanager.properties(.template): 在配置项 `spark.default.conf` 中加入Iceberg配置,随后重启taskmanager。 +- CLI: 在 ini conf 中加入此配置项,并使用`--spark_conf`启动CLI,参考[客户端Spark配置文件](../../reference/client_config/client_spark_config.md)。 + +Iceberg配置详情参考[Iceberg Configuration](https://iceberg.apache.org/docs/latest/spark-configuration/)。 + +例如,在`taskmanager.properties(.template)`中设置hive catalog: + +```properties 
+spark.default.conf=spark.sql.catalog.hive_prod=org.apache.iceberg.spark.SparkCatalog;spark.sql.catalog.hive_prod.type=hive;spark.sql.catalog.hive_prod.uri=thrift://metastore-host:port +``` + +如果需要创建iceberg表,还需要配置`spark.sql.catalog.hive_prod.warehouse`。 + +设置 hadoop catalog: + +```properties +spark.default.conf=spark.sql.catalog.hadoop_prod=org.apache.iceberg.hadoop.HadoopCatalog;spark.sql.catalog.hadoop_prod.type=hadoop;spark.sql.catalog.hadoop_prod.warehouse=hdfs://hadoop-namenode:port/warehouse +``` + +设置 rest catalog: + +```properties +spark.default.conf=spark.sql.catalog.rest_prod=org.apache.iceberg.spark.SparkCatalog;spark.sql.catalog.rest_prod.catalog-impl=org.apache.iceberg.rest.RESTCatalog;spark.sql.catalog.rest_prod.uri=http://iceberg-rest:8181/ +``` + +Iceberg catalog的完整配置参考[Iceberg Catalog Configuration](https://iceberg.apache.org/docs/latest/spark-configuration/)。 + +任一配置成功后,均使用`..`的格式访问Iceberg表。如果不想使用``,可以在配置中设置`spark.sql.catalog.default=`。也可添加`spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog`,`spark.sql.catalog.spark_catalog.type=hive`,让iceberg catalog合入spark catalog中(非iceberg表仍然存在于spark catalog中),这样可以使用`.`的格式访问Iceberg表。 + +### 调试信息 + +成功连接Iceberg Hive Catalog后,你可以在日志中看到类似以下的信息: + +``` +24/01/30 09:01:05 INFO SharedState: Setting hive.metastore.warehouse.dir ('hdfs://namenode:19000/user/hive/warehouse') to the value of spark.sql.warehouse.dir. +24/01/30 09:01:05 INFO SharedState: Warehouse path is 'hdfs://namenode:19000/user/hive/warehouse'. +... +24/01/30 09:01:06 INFO HiveUtils: Initializing HiveMetastoreConnection version 2.3.9 using Spark classes. +24/01/30 09:01:06 INFO HiveClientImpl: Warehouse location for Hive client (version 2.3.9) is hdfs://namenode:19000/user/hive/warehouse +24/01/30 09:01:06 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/01/30 09:01:06 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/01/30 09:01:06 INFO HiveMetaStore: 0: Opening raw store with implementation class:org.apache.hadoop.hive.metastore.ObjectStore +24/01/30 09:01:06 INFO ObjectStore: ObjectStore, initialize called +24/01/30 09:01:06 INFO Persistence: Property hive.metastore.integral.jdo.pushdown unknown - will be ignored +24/01/30 09:01:06 INFO Persistence: Property datanucleus.cache.level2 unknown - will be ignored +24/01/30 09:01:07 INFO ObjectStore: Setting MetaStore object pin classes with hive.metastore.cache.pinobjtypes="Table,StorageDescriptor,SerDeInfo,Partition,Database,Type,FieldSchema,Order" +24/01/30 09:01:07 INFO MetaStoreDirectSql: Using direct SQL, underlying DB is POSTGRES +24/01/30 09:01:07 INFO ObjectStore: Initialized ObjectStore +24/01/30 09:01:08 INFO HiveMetaStore: Added admin role in metastore +24/01/30 09:01:08 INFO HiveMetaStore: Added public role in metastore +24/01/30 09:01:08 INFO HiveMetaStore: No user is added in admin role, since config is empty +24/01/30 09:01:08 INFO HiveMetaStore: 0: get_database: default +``` + +导出到Iceberg时,你可以检查任务日志,应该有类似以下的信息: + +``` +24/01/30 09:57:29 INFO AtomicReplaceTableAsSelectExec: Start processing data source write support: IcebergBatchWrite(table=nyc.taxis_out, format=PARQUET). The input RDD has 1 partitions. +... +24/01/30 09:57:31 INFO AtomicReplaceTableAsSelectExec: Data source write support IcebergBatchWrite(table=nyc.taxis_out, format=PARQUET) committed. +... 
+24/01/30 09:57:31 INFO HiveTableOperations: Committed to table hive_prod.nyc.taxis_out with the new metadata location hdfs://namenode:19000/user/hive/iceberg_storage/nyc.db/taxis_out/metadata/00001-038d8b81-04a6-4a19-bb83-275eb4664937.metadata.json +24/01/30 09:57:31 INFO BaseMetastoreTableOperations: Successfully committed to table hive_prod.nyc.taxis_out in 224 ms +``` + +## 数据格式 + +Iceberg schema参考[Iceberg Schema](https://iceberg.apache.org/spec/#schema)。目前,仅支持以下Iceberg数据格式: + +| OpenMLDB 数据格式 | Iceberg 数据格式 | +| ----------------- | ---------------- | +| BOOL | bool | +| INT | int | +| BIGINT | long | +| FLOAT | float | +| DOUBLE | double | +| DATE | date | +| TIMESTAMP | timestamp | +| STRING | string | + +## 导入 Iceberg 数据到 OpenMLDB + +从 Iceberg 表导入数据,需要使用 [`LOAD DATA INFILE`](../../openmldb_sql/dml/LOAD_DATA_STATEMENT.md) 语句。这个语句使用特殊的 URI 格式 `hive://[db].table`,可以无缝地从 Iceberg 导入数据。以下是一些重要的注意事项: + +- 离线引擎和在线引擎都可以从 Iceberg 表导入数据。 +- 离线导入支持软链接,但是在线导入不支持软链接。使用软链接时,需要在导入OPTIONS中指定 `deep_copy=false`。 +- Iceberg 表导入只有三个参数有效: `deep_copy`, `mode` and `sql`。其他格式参数`delimiter`,`quote`等均无效。 + +例如,通过Iceberg Hive Catalog导入数据: + +```sql +LOAD DATA INFILE 'iceberg://hive_prod.db1.t1' INTO TABLE t1 OPTIONS(deep_copy=false); +-- or +LOAD DATA INFILE 'hive_prod.db1.t1' INTO TABLE t1 OPTIONS(deep_copy=false, format='iceberg'); +``` + +数据导入支持`sql`参数,筛选出表种的特定数据进行导入,注意 SQL 必须符合 SparkSQL 语法,数据表为注册后的表名,不带 `iceberg://` 前缀。 + +```sql +LOAD DATA INFILE 'iceberg://hive_prod.db1.t1' INTO TABLE t1 OPTIONS(deep_copy=false, sql='select * from t1 where id > 100'); +``` + +## 导出 OpenMLDB 数据到 Iceberg + +从 OpenMLDB 导出数据到 Iceberg 表,需要使用 [`SELECT INTO`](../../openmldb_sql/dql/SELECT_INTO_STATEMENT.md) 语句,这个语句使用特殊的 URI 格式 `iceberg://[db].table`,可以无缝地导出数据到 Iceberg 表。以下是一些重要的注意事项: + +- 如果不指定Iceberg数据库名字,则会使用Iceberg默认数据库`default` +- 如果指定Iceberg数据库名字,则该数据库必须已经存在,目前不支持对于不存在的数据库进行自动创建 +- 如果指定的Iceberg表名不存在,则会在 Iceberg 内自动创建对应名字的表 +- `OPTIONS` 参数只有导出模式`mode`生效,其他参数均不生效 + +举例: + +```sql +SELECT col1, col2, col3 FROM t1 INTO OUTFILE 'iceberg://hive_prod.db1.t1'; +``` diff --git a/docs/zh/integration/offline_data_sources/index.rst b/docs/zh/integration/offline_data_sources/index.rst index e7ee72aec6c..ef2877aee8a 100644 --- a/docs/zh/integration/offline_data_sources/index.rst +++ b/docs/zh/integration/offline_data_sources/index.rst @@ -6,4 +6,5 @@ :maxdepth: 1 hive - s3 \ No newline at end of file + s3 + iceberg diff --git a/docs/zh/maintain/diagnose.md b/docs/zh/maintain/diagnose.md index cb5d7a30f74..17ba3319c79 100644 --- a/docs/zh/maintain/diagnose.md +++ b/docs/zh/maintain/diagnose.md @@ -185,7 +185,7 @@ JOB 检查是更灵活的离线任务检查命令,可以按条件筛选job, ### static-check 静态检查 -`static-check`静态检查,根据集群部署配置文件(通过参数`-f,--conf_file`指定),登录各个服务组件的部署地址,可以收集版本信息、配置文件、日志文件,检查版本是否一致,对收集到的配置文件和日志文件做分析。可以在集群未部署前进行检查,避免因程序版本或配置文件错误导致的集群部署失败。或在集群异常时,将分布式的日志文件收集在一起,方便调查问题。 +`static-check`静态检查,根据集群部署配置文件(通过参数`-f,--conf_file`指定),登录各个服务组件的部署地址,可以收集版本信息、配置文件、日志文件,检查版本是否一致,对收集到的配置文件和日志文件做分析。可以在集群*未部署前*进行检查,避免因程序版本或配置文件错误导致的集群部署失败。或在集群异常时,将分布式的日志文件收集在一起,方便调查问题。 ```bash openmldb_tool static-check -h @@ -260,7 +260,7 @@ nameserver: 检查可通过组合FLAG来来指定检查哪些内容,例如,`-V`只检查版本,`-CL`只检查配置文件和日志,`-VCL`检查全部。 -- `-V,--version`检查版本,检查各个组件的版本是否一致,如果不一致,会输出不一致的组件和版本信息。 +- `-V,--version`检查版本,检查各个组件的版本是否一致,如果不一致,会输出不一致的组件和版本信息(由于复杂度较高,openmldb-batch包的地址可能查不到,将忽略检查,替换batch包非常容易,可以推后检查)。 - `-C,--conf`收集配置文件,检查各个组件的配置文件中ZooKeeper地址是否一致等。 - `-L,--log`收集日志,输出WARNING及以上的日志。 diff --git a/docs/zh/openmldb_sql/ddl/ALTER_USER_STATEMENT.md b/docs/zh/openmldb_sql/ddl/ALTER_USER_STATEMENT.md new file mode 
100644 index 00000000000..4b893a32472 --- /dev/null +++ b/docs/zh/openmldb_sql/ddl/ALTER_USER_STATEMENT.md @@ -0,0 +1,45 @@ +# ALTER USER + +`ALTER USER` 语句可用来修改用户密码。 + +## 语法 +```sql +AlterUserstmt ::= + 'ALTER' 'USER' [IF EXISTS] UserName SET OptOptionsList + +UserName ::= Identifier + +OptOptionsList ::= + "OPTIONS" OptionList + +OptionList ::= + OptionsListPrefix ")" + +OptionsListPrefix ::= + "(" OptionEntry + | OptionsListPrefix "," OptionEntry + +OptionEntry ::= + Identifier "=" Identifier +``` + +## **示例** +```sql +ALTER USER user1; +-- SUCCEED +ALTER USER IF EXISTS user2 SET OPTIONS(password='123456'); +-- SUCCEED +ALTER USER user3 SET OPTIONS (password='123456'); +-- SUCCEED +``` + +```{note} +1. 如果不指定OPTIONS密码不会修改 +2. OPTIONS中只能指定password +``` + +## 相关SQL + +[CREATE USER](./CREATE_USER_STATEMENT.md) +[DROP USER](./DROP_USER_STATEMENT.md) +[SHOW CURRENT_USER](./SHOW_CURRENT_USER_STATEMENT.md) \ No newline at end of file diff --git a/docs/zh/openmldb_sql/ddl/CREATE_TABLE_STATEMENT.md b/docs/zh/openmldb_sql/ddl/CREATE_TABLE_STATEMENT.md index a44f699eed3..0113ef730b0 100644 --- a/docs/zh/openmldb_sql/ddl/CREATE_TABLE_STATEMENT.md +++ b/docs/zh/openmldb_sql/ddl/CREATE_TABLE_STATEMENT.md @@ -233,8 +233,8 @@ IndexOption ::= | ----------- | ------------------------------------------------------------ | ---------------------------------------------------- | ------------------------------------------------------------ | | `ABSOLUTE` | TTL的值代表过期时间。配置值为时间段如`100m, 12h, 1d, 365d`。最大可以配置的过期时间为`15768000m`(即30年) | 当记录过期时,会被淘汰。 | `INDEX(KEY=col1, TS=std_time, TTL_TYPE=absolute, TTL=100m)`
OpenMLDB将会删除100分钟之前的数据。 | | `LATEST` | TTL的值代表最大存活条数。即同一个索引下面,最大允许存在的数据条数。最大可以配置1000条 | 记录超过最大条数时,会被淘汰。 | `INDEX(KEY=col1, TS=std_time, TTL_TYPE=LATEST, TTL=10)`。OpenMLDB只会保留最近10条记录,删除以前的记录。 | -| `ABSORLAT` | 配置过期时间和最大存活条数。配置值是一个2元组,形如`(100m, 10), (1d, 1)`。最大可以配置`(15768000m, 1000)`。 | 当且仅当记录过期**或**记录超过最大条数时,才会淘汰。 | `INDEX(key=c1, ts=c6, ttl=(120min, 100), ttl_type=absorlat)`。当记录超过100条,**或者**当记录过期时,会被淘汰 | -| `ABSANDLAT` | 配置过期时间和最大存活条数。配置值是一个2元组,形如`(100m, 10), (1d, 1)`。最大可以配置`(15768000m, 1000)`。 | 当记录过期**且**记录超过最大条数时,记录会被淘汰。 | `INDEX(key=c1, ts=c6, ttl=(120min, 100), ttl_type=absandlat)`。当记录超过100条,**而且**记录过期时,会被淘汰 | +| `ABSORLAT` | 配置过期时间和最大存活条数。配置值是一个2元组,形如`(100m, 10), (1d, 1)`。最大可以配置`(15768000m, 1000)`。 | 当且仅当记录过期**或**记录超过最大条数时,才会淘汰。 | `INDEX(key=c1, ts=c6, ttl=(120m, 100), ttl_type=absorlat)`。当记录超过100条,**或者**当记录过期时,会被淘汰 | +| `ABSANDLAT` | 配置过期时间和最大存活条数。配置值是一个2元组,形如`(100m, 10), (1d, 1)`。最大可以配置`(15768000m, 1000)`。 | 当记录过期**且**记录超过最大条数时,记录会被淘汰。 | `INDEX(key=c1, ts=c6, ttl=(120m, 100), ttl_type=absandlat)`。当记录超过100条,**而且**记录过期时,会被淘汰 | ```{note} 最大过期时间和最大存活条数的限制,是出于性能考虑。如果你一定要配置更大的TTL值,请使用UpdateTTL来增大(可无视max限制),或者调整nameserver配置`absolute_ttl_max`和`latest_ttl_max`,重启生效。 diff --git a/docs/zh/openmldb_sql/ddl/CREATE_USER_STATEMENT.md b/docs/zh/openmldb_sql/ddl/CREATE_USER_STATEMENT.md new file mode 100644 index 00000000000..0d08f9ab1e7 --- /dev/null +++ b/docs/zh/openmldb_sql/ddl/CREATE_USER_STATEMENT.md @@ -0,0 +1,45 @@ +# CREATE USER + +`CREATE USER` 语句用来创建用户。 + +## 语法 +```sql +CreateUserstmt ::= + 'CREATE' 'USER' [IF NOT EXISTS] UserName OptOptionsList + +UserName ::= Identifier + +OptOptionsList ::= + "OPTIONS" OptionList + +OptionList ::= + OptionsListPrefix ")" + +OptionsListPrefix ::= + "(" OptionEntry + | OptionsListPrefix "," OptionEntry + +OptionEntry ::= + Identifier "=" Identifier +``` + +## **示例** +```sql +CREATE USER user1; +-- SUCCEED +CREATE USER IF NOT EXISTS user2; +-- SUCCEED +CREATE USER user3 OPTIONS (password='123456'); +-- SUCCEED +``` + +```{note} +1. OPTIONS中只能指定password +2. 
如果不指定password, 那么密码为空 +``` + +## 相关SQL + +[DROP USER](./DROP_USER_STATEMENT.md) +[ALTER USER](./ALTER_USER_STATEMENT.md) +[SHOW CURRENT_USER](./SHOW_CURRENT_USER_STATEMENT.md) \ No newline at end of file diff --git a/docs/zh/openmldb_sql/ddl/DESC_STATEMENT.md b/docs/zh/openmldb_sql/ddl/DESC_STATEMENT.md index ca0d0de87bf..2384e73d1e8 100644 --- a/docs/zh/openmldb_sql/ddl/DESC_STATEMENT.md +++ b/docs/zh/openmldb_sql/ddl/DESC_STATEMENT.md @@ -37,7 +37,7 @@ USE db1; -- SUCCEED: Database changed ``` -创建两张表: +创建一张自定义索引的表: ```sql CREATE TABLE t1 (col0 STRING, col1 int, std_time TIMESTAMP, INDEX(KEY=col1, TS=std_time, TTL_TYPE=absolute, TTL=30d)); @@ -64,6 +64,37 @@ desc t1; ``` +有离线数据的表: + +```sql + --- ------- ----------- ------ --------- + # Field Type Null Default + --- ------- ----------- ------ --------- + 1 c1 Varchar YES + 2 c2 Int YES + 3 c3 BigInt YES + 4 c4 Float YES + 5 c5 Double YES + 6 c6 Timestamp YES + 7 c7 Date YES + --- ------- ----------- ------ --------- + --- -------------------- ------ ---- ------ --------------- + # name keys ts ttl ttl_type + --- -------------------- ------ ---- ------ --------------- + 1 INDEX_0_1705743486 c1 - 0min kAbsoluteTime + --- -------------------- ------ ---- ------ --------------- + ---------------------------------------------------------- ------------------------------------------ --------- --------- + Data path Symbolic paths Format Options + ---------------------------------------------------------- ------------------------------------------ --------- --------- + file:///tmp/openmldb_offline_storage/demo_db/demo_table1 file:///work/taxi-trip/data/data.parquet parquet + ---------------------------------------------------------- ------------------------------------------ --------- --------- + + --------------- -------------- + compress_type storage_mode + --------------- -------------- + NoCompress Memory + --------------- -------------- +``` ## 相关语句 diff --git a/docs/zh/openmldb_sql/ddl/DROP_USER_STATEMENT.md b/docs/zh/openmldb_sql/ddl/DROP_USER_STATEMENT.md new file mode 100644 index 00000000000..5c4775adb81 --- /dev/null +++ b/docs/zh/openmldb_sql/ddl/DROP_USER_STATEMENT.md @@ -0,0 +1,29 @@ +# DROP USER + +`DROP USER` 语句用来删除用户。 + +## 语法 +```sql +DropUserstmt ::= + 'DROP' 'USER' [IF EXISTS] UserName + +UserName ::= Identifier +``` + +## **示例** +```sql +DROP USER user1; +-- SUCCEED +DROP USER IF EXISTS user2; +-- SUCCEED +``` + +```{note} +1. 
不能删除root用户 +``` + +## 相关SQL + +[CREATE USER](./CREATE_USER_STATEMENT.md) +[ALTER USER](./ALTER_USER_STATEMENT.md) +[SHOW CURRENT_USER](./SHOW_CURRENT_USER_STATEMENT.md) \ No newline at end of file diff --git a/docs/zh/openmldb_sql/ddl/SHOW_CURRENT_USER_STATEMENT.md b/docs/zh/openmldb_sql/ddl/SHOW_CURRENT_USER_STATEMENT.md new file mode 100644 index 00000000000..0c6548acb9e --- /dev/null +++ b/docs/zh/openmldb_sql/ddl/SHOW_CURRENT_USER_STATEMENT.md @@ -0,0 +1,17 @@ +# SHOW CURRENT_USER + +`SHOW CURRENT_USER` 显示当前用户 + +## **示例** +```sql +SHOW CURRENT_USER; + ------ + User + ------ + root + ------ +``` + +[CREATE USER](./CREATE_USER_STATEMENT.md) +[ALTER USER](./ALTER_USER_STATEMENT.md) +[DROP USER](./DROP_USER_STATEMENT.md) \ No newline at end of file diff --git a/docs/zh/openmldb_sql/dml/DELETE_STATEMENT.md b/docs/zh/openmldb_sql/dml/DELETE_STATEMENT.md index 716af9351fd..729f32b53a1 100644 --- a/docs/zh/openmldb_sql/dml/DELETE_STATEMENT.md +++ b/docs/zh/openmldb_sql/dml/DELETE_STATEMENT.md @@ -12,7 +12,6 @@ TableName ::= **说明** -- `DELETE` 语句删除在线表满足指定条件的数据,删除并不是所有索引中满足条件的数据都被删除,只会删除与where condition相关的索引,示例见[功能边界](../../quickstart/function_boundary.md#delete)。 - `WHERE` 指定的筛选列必须是索引列。如果是key列只能用等于 ## Examples diff --git a/docs/zh/openmldb_sql/dml/INSERT_STATEMENT.md b/docs/zh/openmldb_sql/dml/INSERT_STATEMENT.md index 6ecf98390a3..4799e557577 100644 --- a/docs/zh/openmldb_sql/dml/INSERT_STATEMENT.md +++ b/docs/zh/openmldb_sql/dml/INSERT_STATEMENT.md @@ -5,7 +5,7 @@ OpenMLDB 支持一次插入单行或多行数据。 ## syntax ``` -INSERT INFO tbl_name (column_list) VALUES (value_list) [, value_list ...] +INSERT [[OR] IGNORE] INTO tbl_name (column_list) VALUES (value_list) [, value_list ...] column_list: col_name [, col_name] ... @@ -16,6 +16,7 @@ value_list: **说明** - `INSERT` 只能用在在线模式 +- 默认`INSERT`不会去重,`INSERT OR IGNORE` 则可以忽略已存在于表中的数据,可以反复重试。 ## Examples diff --git a/docs/zh/openmldb_sql/dml/LOAD_DATA_STATEMENT.md b/docs/zh/openmldb_sql/dml/LOAD_DATA_STATEMENT.md index b3c7ffc55bf..48b72943675 100644 --- a/docs/zh/openmldb_sql/dml/LOAD_DATA_STATEMENT.md +++ b/docs/zh/openmldb_sql/dml/LOAD_DATA_STATEMENT.md @@ -1,10 +1,10 @@ # LOAD DATA INFILE -`LOAD DATA INFILE`语句能高效地将文件中的数据读取到数据库中的表中。`LOAD DATA INFILE` 与 `SELECT INTO OUTFILE`互补。要将数据从 table导出到文件,请使用[SELECT INTO OUTFILE](../dql/SELECT_INTO_STATEMENT.md)。要将文件数据导入到 table 中,请使用`LOAD DATA INFILE`。 +`LOAD DATA INFILE`语句能高效地将文件中的数据读取到数据库中的表中。`LOAD DATA INFILE` 与 `SELECT INTO OUTFILE`互补。要将数据从 table导出到文件,请使用[SELECT INTO OUTFILE](../dql/SELECT_INTO_STATEMENT.md)。要将文件数据导入到 table 中,请使用`LOAD DATA INFILE`。注意,导入的文件schema顺序应与表的schema顺序一致。 ```{note} 无论何种load_mode,INFILE 的 filePath既可以是单个文件名,也可以是目录,也可以使用`*`通配符。 - load_mode=cluster的具体格式等价于`DataFrameReader.read.load(String)`,可以使用spark shell来read你想要的文件路径,确认能否读入成功。如果目录中存在多格式的文件,只会选择 LoadDataInfileOptionsList 中指定的FORMAT格式文件。 -- load_mode=local则使用glob选择出符合的所有文件,不会检查单个文件的格式,所以,请保证满足条件的是csv格式,建议使用`*.csv`限制文件格式。 +- load_mode=local则使用glob选择出符合的所有文件,不会检查单个文件的格式,所以,请保证满足条件的文件都是csv格式,建议使用`*.csv`限制文件格式。 ``` ## Syntax @@ -55,9 +55,10 @@ FilePathPattern | quote | String | " | 输入数据的包围字符串。字符串长度<=1。
load_mode=`cluster`默认为双引号`"`。配置包围字符后,被包围字符包围的内容将作为一个整体解析。例如,当配置包围字符串为"#"时, `1, 1.0, #This is a string field, even there is a comma#, normal_string`将被解析为四个field,第一个是整数1,第二个是浮点1.0,第三个是一个字符串,第四个虽然没有quote,但也是一个字符串。
**load_mode=`local`默认为`\0`,也可使用空字符串赋值,不处理包围字符。** | | mode | String | "error_if_exists" | 导入模式:
`error_if_exists`: 仅离线模式可用,若离线表已有数据则报错。
`overwrite`: 仅离线模式可用,数据将覆盖离线表数据。
`append`:离线在线均可用,若文件已存在,数据将追加到原文件后面。
**load_mode=`local`默认为`append`** | | deep_copy | Boolean | true | `deep_copy=false`仅支持离线load, 可以指定`INFILE` Path为该表的离线存储地址,从而不需要硬拷贝。 | -| load_mode | String | cluster | `load_mode='local'`仅支持从csv本地文件导入在线存储, 它通过本地客户端同步插入数据;
`load_mode='cluster'`仅支持集群版, 通过spark插入数据,支持同步或异步模式 | +| load_mode | String | cluster | `load_mode='local'`仅支持从csv本地文件导入在线存储, 它通过本地客户端同步插入数据;
`load_mode='cluster'`仅支持集群版, 通过spark插入数据,支持同步或异步模式
local模式的使用限制见[local导入模式说明](#local导入模式说明) | | thread | Integer | 1 | 仅在本地文件导入时生效,即`load_mode='local'`或者单机版,表示本地插入数据的线程数。 最大值为`50`。 | | writer_type | String | single | 集群版在线导入中插入数据的writer类型。可选值为`single`和`batch`,默认为`single`。`single`表示数据即读即写,节省内存。`batch`则是将整个rdd分区读完,确认数据类型有效性后,再写入集群,需要更多内存。在部分情况下,`batch`模式有利于筛选未写入的数据,方便重试这部分数据。 | +| put_if_absent | Boolean | false | 在源数据无重复行也不与表中已有数据重复时,可以使用此选项避免插入重复数据,特别是job失败后可以重试。等价于使用`INSERT OR IGNORE`。更多详情见下文。 | ```{note} 在集群版中,`LOAD DATA INFILE`语句会根据当前执行模式(execute_mode)决定将数据导入到在线或离线存储。单机版中没有存储区别,只会导入到在线存储中,同时也不支持`deep_copy`选项。 @@ -73,6 +74,7 @@ FilePathPattern 所以,请尽量使用绝对路径。单机测试中,本地文件用`file://`开头;生产环境中,推荐使用hdfs等文件系统。 ``` + ## SQL语句模版 ```sql @@ -119,7 +121,36 @@ LOAD DATA INFILE 'hive://db1.t1' INTO TABLE t1; ## 离线导入规则 -表的离线信息可通过`desc `查看。我们将数据地址分为两类,离线地址是OpenMLDB的内部存储路径,硬拷贝将写入此地址,仅一个;软链接地址是软链接导入的地址列表。 +表的离线信息可通过`desc
`查看。我们将数据地址分为两类,Data path与Symbolic path,离线地址Data path是OpenMLDB的内部存储路径,硬拷贝将写入此地址,仅一个;软链接地址Symbolic path,则是软链接导入的地址列表,可以是多个。 +``` + --- ------- ----------- ------ --------- + # Field Type Null Default + --- ------- ----------- ------ --------- + 1 c1 Varchar YES + 2 c2 Int YES + 3 c3 BigInt YES + 4 c4 Float YES + 5 c5 Double YES + 6 c6 Timestamp YES + 7 c7 Date YES + --- ------- ----------- ------ --------- + --- -------------------- ------ ---- ------ --------------- + # name keys ts ttl ttl_type + --- -------------------- ------ ---- ------ --------------- + 1 INDEX_0_1705743486 c1 - 0min kAbsoluteTime + --- -------------------- ------ ---- ------ --------------- + ---------------------------------------------------------- ------------------------------------------ --------- --------- + Data path Symbolic paths Format Options + ---------------------------------------------------------- ------------------------------------------ --------- --------- + file:///tmp/openmldb_offline_storage/demo_db/demo_table1 file:///work/taxi-trip/data/data.parquet parquet + ---------------------------------------------------------- ------------------------------------------ --------- --------- + + --------------- -------------- + compress_type storage_mode + --------------- -------------- + NoCompress Memory + --------------- -------------- +``` 根据模式的不同,对离线信息的修改也不同。 - overwrite模式,将会覆盖原有的所有字段,包括离线地址、软链接地址、格式、读取选项,仅保留当前overwrite进入的信息。 - overwrite 硬拷贝,离线地址如果存在数据将被覆盖,软链接全部清空,格式更改为内部默认格式parquet,读取选项全部清空。 @@ -142,19 +173,46 @@ curl http:///NameServer/UpdateOfflineTableInfo -d '{"db":" ## CSV源数据格式说明 导入支持csv和parquet两种数据格式,csv的格式需要特别注意,下面举例说明。 - -``` -c1, c2 -, -"","" -ab,cd -"ef","gh" -null,null -``` -这个csv源数据中,第一行两个空值(blank value)。 -- cluster模式空值会被当作`null`(无论null_value是什么)。 -- local模式空值会被当作空字符串,具体见[issue3015](https://github.com/4paradigm/OpenMLDB/issues/3015)。 - -第二行两列都是两个双引号。 -- cluster模式默认quote为`"`,所以这一行是两个空字符串。 -- local模式默认quote为`\0`,所以这一行两列都是两个双引号。local模式quote可以配置为`"`,但escape规则是`""`为单个`"`,和Spark不一致,具体见[issue3015](https://github.com/4paradigm/OpenMLDB/issues/3015)。 +1. csv的列分隔符默认为`,`,不允许出现空格,否则,"a, b"将被解析为两列,第一列为`a`,第二列为` b`(有一个空格)。 + 1. local模式会trim掉列分隔符两边的空格,所以`a, b`会被解析为两列,第一列为`a`,第二列为`b`。但从规范上来说,csv的列分隔符左右不应该有空格,请不要依赖这个特性。 +2. cluster和local模式对于空值的处理不同,具体为: + ``` + c1, c2 + , + "","" + ab,cd + "ef","gh" + null,null + ``` + 这个csv源数据中,第一行两个空值(blank value)。 + - cluster模式空值会被当作`null`(无论null_value是什么)。 + - local模式空值会被当作空字符串,具体见[issue3015](https://github.com/4paradigm/OpenMLDB/issues/3015)。 + + 第二行两列都是两个双引号。 + - cluster模式默认quote为`"`,所以这一行是两个空字符串。 + - local模式默认quote为`\0`,所以这一行两列都是两个双引号。local模式quote可以配置为`"`,但escape规则是`""`为单个`"`,和Spark不一致,具体见[issue3015](https://github.com/4paradigm/OpenMLDB/issues/3015)。 + +3. cluster的csv格式支持两种格式的timestamp,但同一次load只会选择一种格式,不会混合使用。如果csv中存在两种格式的timestamp,会导致解析失败。选择哪种格式由第一行数据决定,如果第一行数据是`2020-01-01 00:00:00`,则后续所有timestamp都会按照`yyyy-MM-dd HH:mm:ss`格式解析;如果第一行数据是整型`1577808000000`,则后续所有timestamp都会按照整型格式解析。 + 1. timestamp可以为字符串格式,比如`"2020-01-01 00:00:00"`。 + 2. date可以是年月日(`yyyy-MM-dd`)或者年月日时分秒(`yyyy-MM-dd HH:mm:ss`)。 +4. local的csv格式只支持整型timestamp,date类型为年月日,例如`2022-2-2`。 + 1. timestamp和date均不可以为字符串格式,比如`"2020-01-01"`将解析失败。 + 2. date不可以是年月日时分秒,例如`2022-2-2 00:00:00`将解析失败。 +5. local的字符串不支持quote转义,所以如果你的字符串中存在quote字符,请使用cluster模式。 +6. 
cluster如果读取csv时解析失败,将会把失败的列值设为NULL,继续导入流程,但local模式会直接报错,不会继续导入。 + +## PutIfAbsent说明 + +PutIfAbsent是一个特殊的选项,它可以避免插入重复数据,仅需一个配置,操作简单,特别适合load data job失败后重试,等价于使用`INSERT OR IGNORE`。如果你想要导入的数据中存在重复,那么通过PutIfAbsent导入,会导致部分数据丢失。如果你需要保留重复数据,不应使用此选项,建议通过其他方式去重后再导入。local模式暂不支持此选项。 + +PutIfAbsent需要去重这一额外开销,所以,它的性能与去重的复杂度有关: + +- 表中只存在ts索引,且同一key+ts的数据量少于10k时(为了精确去重,在同一个key+ts下会逐行对比整行数据),PutIfAbsent的性能表现不会很差,通常导入时间在普通导入时间的2倍以内。 +- 表中如果存在time索引(ts列为空),或者ts索引同一key+ts的数据量大于100k时,PutIfAbsent的性能会很差,导入时间可能超过普通导入时间的10倍,无法正常使用。这样的数据条件下,更建议进行去重后再导入。 + +## local导入模式说明 + +load_mode可使用local模式,但与cluster模式有一些不同,如果你部署了TaskManager,我们建议使用cluster模式。不同之处如下: + +- local模式仅支持在线,不支持离线。也只支持csv格式,不支持parquet格式。 +- csv的读取支持有限(内部使用SplitLineWithDelimiterForStrings解析) diff --git a/docs/zh/openmldb_sql/notice.md b/docs/zh/openmldb_sql/notice.md index c9705ca3a1f..43629fd0b3e 100644 --- a/docs/zh/openmldb_sql/notice.md +++ b/docs/zh/openmldb_sql/notice.md @@ -7,7 +7,7 @@ | CREATE TABLE | 1. 在建表语句中如果没有指定索引,默认会自动创建一个`absolute 0`的索引。这个索引下的数据永不过期,可能会占用大量内存
2. 磁盘表`absandlat`和`absorlat`类型没有过期删除 | DROP TABLE | 1. 删除表默认是异步操作,执行完成后,异步删除表中的数据
2. 如果有分片在做snapshot, 会删除失败。可能存在部分分片已删除、部分未删除的情况
3. 删除时默认会把数据目录放到recycle目录下。tablet的配置文件中`recycle_bin_enabled`参数可以配置是否要放到recycle, 默认是开启的
4. 由于内存碎片问题,释放的内存不一定完全释放给操作系统 | INSERT | 如果返回失败,可能有一部分数据已经插入进去 -| DELETE | 1. 删除的数据不会立马从内存中物理删除,需要等一个过期删除时间间隔(即参数 `gc_interval`)
2. 如果设置了长窗口,不会更新预聚合表里的数据 +| DELETE | 删除的数据不会立马从内存中物理删除,需要等一个过期删除时间间隔(即参数 `gc_interval`) | CREATE INDEX | 1. 创建索引是一个异步操作,如果表里有数据需要等一段时间 `desc` 命令才能显示出来
2. 在创建索引的过程中如果有写操作,那么可能会有部分新写入的数据在新加的索引上查询不出来
3. 磁盘表不支持创建索引 | DROP INDEX | 1. 删除一个索引之后,如果要再重新创建相同的索引需要等两个过期删除时间间隔(即参数 `gc_interval`)
2. 执行该命令后,内存中的索引并不会马上被真正删除,需要等两个过期删除时间间隔后才会在内存中真正执行删除动作
3. 磁盘表不支持删除索引 | DEPLOY | 1. DEPLOY 命令可能会修改相关表的TTL,执行DEPLOY前导入的数据可能在新TTL生效前被淘汰,新的TTL生效时间为2个`gc_interval`
2. 在deployment关联的表中,如果有磁盘表需要添加索引,那么部署会失败,可能有部分索引已经添加成功 diff --git a/docs/zh/openmldb_sql/udf_develop_guide.md b/docs/zh/openmldb_sql/udf_develop_guide.md index 89771df4b5a..9f7e8eafb20 100644 --- a/docs/zh/openmldb_sql/udf_develop_guide.md +++ b/docs/zh/openmldb_sql/udf_develop_guide.md @@ -16,14 +16,14 @@ 内置C++函数的参数类型限定为:BOOL类型,数值类型,时间戳日期类型和字符串类型。C++类型SQL类型对应关系如下: -| SQL类型 | C/C++ 类型 | -| :-------- | :----------------- | -| BOOL | `bool` | -| SMALLINT | `int16_t` | -| INT | `int32_t` | -| BIGINT | `int64_t` | -| FLOAT | `float` | -| DOUBLE | `double` | +| SQL类型 | C/C++ 类型 | +| :-------- | :---------- | +| BOOL | `bool` | +| SMALLINT | `int16_t` | +| INT | `int32_t` | +| BIGINT | `int64_t` | +| FLOAT | `float` | +| DOUBLE | `double` | | STRING | `StringRef` | | TIMESTAMP | `Timestamp` | | DATE | `Date` | @@ -44,6 +44,15 @@ }; ``` +- 如果参数声明为nullable的,那么所有参数都是nullable的,每一个输入参数其后需要添加bool参数(通常命名为is_null),其顺序为`arg1, arg1_is_null, arg2, arg2_is_null, ...`。不可以随意调整参数顺序。 +- 如果返回值声明为nullable的,那么通过参数来返回,并且添加bool参数(通常命名为is_null)来表示返回值是否为null + +例如,函数sum有俩个参数,如果参数和返回值设置为nullable的话,单行函数原型如下: +```c++ +extern "C" +void sum(::openmldb::base::UDFContext* ctx, int64_t input1, bool input1_is_null, int64_t input2, bool input2_is_null, int64_t* output, bool* is_null) { +``` + 函数声明: * 函数必须用extern "C"来声明 @@ -57,17 +66,6 @@ ``` - 一次分配空间的最大长度不能超过2M字节 -**注**: - -- 如果参数声明为nullable的,那么所有参数都是nullable的,每一个输入参数都添加is_null参数 -- 如果返回值声明为nullable的,那么通过参数来返回,并且添加is_null的参数来表示返回值是否为null - -如函数sum有俩个参数,如果参数和返回值设置为nullable的话,单行函数原型如下: -```c++ -extern "C" -void sum(::openmldb::base::UDFContext* ctx, int64_t input1, bool is_null, int64_t input2, bool is_null, int64_t* output, bool* is_null) { -``` - #### 单行函数开发 单行函数(scalar function)对单行数据进行处理,返回单个值,比如 `abs`, `sin`, `cos`, `date`, `year` 等。 @@ -95,6 +93,8 @@ void cut2(::openmldb::base::UDFContext* ctx, ::openmldb::base::StringRef* input, } ``` +因为返回值是string类型,所以此处需要通过函数最后一个参数返回。如果返回值是基本类型,通过函数返回值返回,可参考[test_udf.cc](https://github.com/4paradigm/OpenMLDB/blob/main/src/examples/test_udf.cc)中的strlength。 + #### 聚合函数开发 聚合函数(aggregate function)对一个数据集(比如一列数据)执行计算,返回单个值,比如 `sum`, `avg`, `max`, `min`, `count` 等。 @@ -141,9 +141,75 @@ int64_t special_sum_output(::openmldb::base::UDFContext* ctx) { return *(reinterpret_cast(ctx->ptr)) + 5; } +// Get the third non-null value of all values +extern "C" +::openmldb::base::UDFContext* third_init(::openmldb::base::UDFContext* ctx) { + ctx->ptr = reinterpret_cast(new std::vector()); + return ctx; +} + +extern "C" +::openmldb::base::UDFContext* third_update(::openmldb::base::UDFContext* ctx, int64_t input, bool is_null) { + auto vec = reinterpret_cast*>(ctx->ptr); + if (!is_null && vec->size() < 3) { + vec->push_back(input); + } + return ctx; +} + +extern "C" +void third_output(::openmldb::base::UDFContext* ctx, int64_t* output, bool* is_null) { + auto vec = reinterpret_cast*>(ctx->ptr); + if (vec->size() != 3) { + *is_null = true; + } else { + *is_null = false; + *output = vec->at(2); + } + // free the memory allocated in init function with new/malloc + delete vec; +} + +// Get the first non-null value >= threshold +extern "C" +::openmldb::base::UDFContext* first_ge_init(::openmldb::base::UDFContext* ctx) { + // threshold init in update + // threshold, thresh_flag, first_ge, first_ge_flag + ctx->ptr = reinterpret_cast(new std::vector(4, 0)); + return ctx; +} + +extern "C" +::openmldb::base::UDFContext* first_ge_update(::openmldb::base::UDFContext* ctx, int64_t input, bool is_null, int64_t threshold, bool threshold_is_null) { + auto pair = 
reinterpret_cast*>(ctx->ptr); + if (!threshold_is_null && pair->at(1) == 0) { + pair->at(0) = threshold; + pair->at(1) = 1; + } + if (!is_null && pair->at(3) == 0 && input >= pair->at(0)) { + pair->at(2) = input; + pair->at(3) = 1; + } + return ctx; +} + +extern "C" +void first_ge_output(::openmldb::base::UDFContext* ctx, int64_t* output, bool* is_null) { + auto pair = reinterpret_cast*>(ctx->ptr); + // threshold is null or no value >= threshold + if (pair->at(1) == 0 || pair->at(3) == 0) { + *is_null = true; + } else { + *is_null = false; + *output = pair->at(2); + } + // *is_null = true; + // free the memory allocated in init function with new/malloc + delete pair; +} ``` -更多udf/udaf实现参考[这里](../../../src/examples/test_udf.cc)。 +如上所示,聚合函数init函数仅单参数,无论是几个参数的聚合函数,init中都只有一个参数UDFContext。update函数参数个数和类型,与聚合函数的参数个数和类型一致。同样的,如果想要聚合函数nullable,每个参数都需要添加一个bool参数,表示该参数是否为null。output函数只会有一个输出参数或返回值,nullable同理。更多udf/udaf实现参考[这里](../../../src/examples/test_udf.cc)。 ### 编译动态库 - 拷贝include目录 `https://github.com/4paradigm/OpenMLDB/tree/main/include` 到某个路径下,下一步编译会用到。如/work/OpenMLDB/ @@ -185,15 +251,15 @@ g++ -shared -o libtest_udf.so examples/test_udf.cc -I /work/OpenMLDB/include -st ### 注册、删除和查看函数 注册函数使用[CREATE FUNCTION](../openmldb_sql/ddl/CREATE_FUNCTION.md) -注册单行函数 +注册单行函数,cut2函数将字符串的前两个字符返回: ```sql CREATE FUNCTION cut2(x STRING) RETURNS STRING OPTIONS (FILE='libtest_udf.so'); ``` -注册聚合函数 +注册聚合函数,special_sum函数初始为10,再将输入的值累加,并且最后加上5返回(演示函数,无特殊意义): ```sql CREATE AGGREGATE FUNCTION special_sum(x BIGINT) RETURNS BIGINT OPTIONS (FILE='libtest_udf.so'); ``` -注册聚合函数,并且输入参数和返回值都支持null +注册聚合函数,并且输入参数和返回值都支持null,third函数返回第三个非null的值,如果非null的值不足3个,返回null: ```sql CREATE AGGREGATE FUNCTION third(x BIGINT) RETURNS BIGINT OPTIONS (FILE='libtest_udf.so', ARG_NULLABLE=true, RETURN_NULLABLE=true); ``` @@ -212,3 +278,9 @@ SHOW FUNCTIONS; ``` DROP FUNCTION cut2; ``` + +```{warning} +同一个udf so如果注册了多个函数,只删除一个函数时,该so不会从Tablet Server内存中删除。此时替换so文件是无用的,并且如果此时增删udf,有一定危险影响Tablet Server运行。 + +建议:**删除所有udf后,替换udf so文件**。 +``` \ No newline at end of file diff --git a/docs/zh/quickstart/beginner_must_read.md b/docs/zh/quickstart/beginner_must_read.md index 60522283942..ad403a6b423 100644 --- a/docs/zh/quickstart/beginner_must_read.md +++ b/docs/zh/quickstart/beginner_must_read.md @@ -69,6 +69,16 @@ OpenMLDB是在线离线存储计算分离的,所以,你需要明确自己导 关于如何设计你的数据流入流出,可参考[实时决策系统中 OpenMLDB 的常见架构整合方式](../tutorial/app_arch.md)。 +### 在线表 + +在线表是存在内存中的数据,同时也会使用硬盘进行备份恢复。在线表的数据,可以通过`select count(*) from t1`来检查条数,或者使用`show table status`来查看表状态(可能有一定延迟,可以稍等再查)。 + +在线表是可以有多个索引的,通过`desc
`可以查看。写入一条数据时每个索引中都会写入一条,区别是各个索引的分类排序不同。但由于索引还有TTL淘汰机制,各个索引的数据量可能不一致。`select count(*) from t1`和`show table status`的结果是第一个索引的数据量,它并不代表其他索引的数据量。SQL查询会使用哪一个索引,是由SQL Engine选择的最优索引,可以通过SQL物理计划来查看。 + +建表时,可以指定索引,也可以不指定,不指定时,会默认创建一个索引。如果是默认索引,它无ts列(用当前time作为排序列,我们称为time索引)将会永不淘汰数据,可以以它为标准检查数据量是否准确,但这样的索引会占用太多的内存,目前也不可以删除第一条索引(计划未来支持),可以通过NS Client修改TTL淘汰数据,减少它的内存占用。 + +time索引(无ts的索引)还会影响PutIfAbsent导入。如果你的数据导入可能中途失败,无其他方法进行删除或去重,想要使用PutIfAbsent来进行导入重试时,请参考[PutIfAbsent说明](../openmldb_sql/dml/LOAD_DATA_STATEMENT.md#putifabsent说明)对自己的数据进行评估,避免PutIfAbsent效率太差。 + ## 源数据 ### LOAD DATA diff --git a/docs/zh/quickstart/cli.md b/docs/zh/quickstart/cli.md index fb644b32a6c..97acd3209ac 100644 --- a/docs/zh/quickstart/cli.md +++ b/docs/zh/quickstart/cli.md @@ -29,6 +29,8 @@ bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_cli - sync_job_timeout: CLI执行离线同步任务的默认同步等待时间0.5h,如果离线同步任务需要等待更长的时间,可改变这一配置,但注意还需要改变集群中TaskManager的配置,详情见[离线命令配置详情](../openmldb_sql/ddl/SET_STATEMENT.md#离线命令配置详情)。 - zk_log_level & zk_log_file: CLI连接ZooKeeper产生的日志默认是不打印的,如果需要展示日志,可以调整`zk_log_level`。打印的日志默认是打印到stderr,且由于ZooKeeper连接是后台线程,可能出现CLI交互界面突然出现ZooKeeper相关的日志,不影响CLI的使用但影响界面展示,可以使用`zk_log_file`将ZooKeeper相关的日志输出到文件中。 - zk_session_timeout: 期望的ZooKeeper session超时时间,并不一定是真实的session超时时间。如果调整过大,也需要调整ZooKeeper Server的tickTime或maxSessionTimeout。 +- user: 指定登录用的用户名。如果不指定默认为root。 +- password: 指定登录用的密码。如果不指定,需要再交互模式下输入密码。 ## 非交互式使用方法 diff --git a/docs/zh/quickstart/function_boundary.md b/docs/zh/quickstart/function_boundary.md index 30fa4285a59..ea0d4ae9be8 100644 --- a/docs/zh/quickstart/function_boundary.md +++ b/docs/zh/quickstart/function_boundary.md @@ -82,63 +82,6 @@ spark.default.conf=spark.port.maxRetries=32;foo=bar 单个任务最大的并发数限制为`spark.executor.instances`*`spark.executor.cores`,请调整这两个配置。当spark.master=local时,调整driver的,而不是executor的。 -### DELETE - -在线存储的表有多索引,`DELETE` 可能无法删除所有索引中的对应数据,所以,可能出现删除了数据,却能查出已删除数据的情况。 - -举例: - -```SQL -create database db; -use db; -create table t1(c1 int, c2 int,index(key=c1),index(key=c2)); -desc t1; -set @@execute_mode='online'; -insert into t1 values (1,1),(2,2); -delete from t1 where c2=2; -select * from t1; -select * from t1 where c2=2; -``` - -结果如下: - -```Plain - --- ------- ------ ------ --------- - Field Type Null Default - --- ------- ------ ------ --------- - 1 c1 Int YES - 2 c2 Int YES - --- ------- ------ ------ --------- - --- -------------------- ------ ---- ------ --------------- - name keys ts ttl ttl_type - --- -------------------- ------ ---- ------ --------------- - 1 INDEX_0_1668504212 c1 - 0min kAbsoluteTime - 2 INDEX_1_1668504212 c2 - 0min kAbsoluteTime - --- -------------------- ------ ---- ------ --------------- - -------------- - storage_mode - -------------- - Memory - -------------- - ---- ---- - c1 c2 - ---- ---- - 1 1 - 2 2 - ---- ---- - -2 rows in set - ---- ---- - c1 c2 - ---- ---- - -0 rows in set -``` - -说明: - -表 `t1` 有多个索引(`DEPLOY` 也可能自动创建出多索引),`delete from t1 where c2=2` 实际只删除了第二个 index 的数据,第一个 index 数据没有被影响。这是因为delete的where condition只与第二个index相关,第一个index中没有任何该condition相关的key或ts。而 `select * from t1` 使用第一个索引,并非第二个,结果就会有两条数据,直观感受为delete失败了;`select * from t1 where c2=2` 使用第二个索引,结果为空,证明数据在该索引下已被删除。 - ## DQL 边界 根据执行模式的不同,支持的查询模式(即 `SELECT` 语句)也有所不同: diff --git a/docs/zh/quickstart/openmldb_quickstart.md b/docs/zh/quickstart/openmldb_quickstart.md index c9a0dee18a8..12ec5724f16 100644 --- a/docs/zh/quickstart/openmldb_quickstart.md +++ b/docs/zh/quickstart/openmldb_quickstart.md @@ -38,12 +38,16 @@ docker run -it 4pdosc/openmldb:0.8.4 bash ```bash 
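# 在容器内通过 CLI 连接 OpenMLDB 集群;下面命令中的 ZooKeeper 地址为容器内默认配置
# 若连接自行部署的集群,可按实际环境替换地址(示意写法,仅作说明):
# /work/openmldb/bin/openmldb --zk_cluster=<实际zk地址:端口> --zk_root_path=/openmldb --role=sql_client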
/work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client +# 或脚本 +/work/openmldb/sbin/openmldb-cli.sh ``` 成功启动 OpenMLDB CLI 后如下图显示: ![image](./images/cli_cluster.png) +如果你需要对 OpenMLDB 集群进行配置修改,/work/init.sh使用的是sbin一键部署方式,具体参考[一键部署](../deploy/install_deploy.md#部署方式一一键部署推荐)。 + ## 使用流程 参照核心概念,使用 OpenMLDB 的流程一般包含:建立数据库和表、导入离线数据、离线特征计算、SQL 方案上线、导入在线数据、在线实时特征计算六个步骤。 @@ -146,8 +150,6 @@ SET @@execute_mode='online'; SELECT * FROM demo_table1 LIMIT 10; ``` -注意,目前要求成功完成 SQL 上线部署后,才能导入在线数据;如果先导入在线数据,会导致部署出错。 - ```{note} 本篇教程在数据导入以后,略过了实时数据接入的步骤。在实际场景中,由于现实时间的推移,需要将最新的实时数据更新到在线数据库。具体可以通过 OpenMLDB SDK 或者在线数据源 connector 实现(如 Kafka、Pulsar 等)。 ``` diff --git a/docs/zh/quickstart/sdk/java_sdk.md b/docs/zh/quickstart/sdk/java_sdk.md index 5503cc97fd1..e2c11b5f5b8 100644 --- a/docs/zh/quickstart/sdk/java_sdk.md +++ b/docs/zh/quickstart/sdk/java_sdk.md @@ -55,6 +55,9 @@ Connection connection = DriverManager.getConnection("jdbc:openmldb:///?zk=localh // Set database in jdbcUrl Connection connection1 = DriverManager.getConnection("jdbc:openmldb:///test_db?zk=localhost:6181&zkPath=/openmldb"); + +// Set user and password in jdbcUrl +Connection connection = DriverManager.getConnection("jdbc:openmldb:///?zk=localhost:6181&zkPath=/openmldb&user=root&password=123456"); ``` Connection 地址指定的 db 在创建连接时必须存在。 @@ -116,6 +119,10 @@ option.setZkCluster("127.0.0.1:2181"); option.setZkPath("/openmldb"); option.setSessionTimeout(10000); option.setRequestTimeout(60000); +// 如果不指定用户名,默认是root +option.setUser("root"); +// 如果不指定密码,默认是空 +option.setPassword("123456"); ``` 然后使用 SdkOption 创建 Executor。 diff --git a/docs/zh/quickstart/sdk/python_sdk.md b/docs/zh/quickstart/sdk/python_sdk.md index 69544b81db7..38bf23736e9 100644 --- a/docs/zh/quickstart/sdk/python_sdk.md +++ b/docs/zh/quickstart/sdk/python_sdk.md @@ -21,6 +21,8 @@ pip install openmldb ```python import openmldb.dbapi db = openmldb.dbapi.connect(zk="$zkcluster", zkPath="$zkpath") +# 可以设置用户名和密码。如果不设置用户名,默认为root。密码默认为空 +# db = openmldb.dbapi.connect(zk="$zkcluster", zkPath="$zkpath", user="$user", password="$password") cursor = db.cursor() ``` @@ -116,8 +118,10 @@ cursor.close() ### 创建连接 -``` +```python create_engine('openmldb:///db_name?zk=zkcluster&zkPath=zkpath') +# 可以通过如下方式指定用户名密码 +# create_engine('openmldb:///db_name?zk=zkcluster&zkPath=zkpath&user=root&password=123456') ``` 参数 db_name 必须存在,需在创建连接前创建数据库。或者先创建无数据库的连接,再通过 `execute("USE ")` 命令设置使用数据库 `db`。 diff --git a/docs/zh/quickstart/sdk/rest_api.md b/docs/zh/quickstart/sdk/rest_api.md index 8e088131369..4db364b4c90 100644 --- a/docs/zh/quickstart/sdk/rest_api.md +++ b/docs/zh/quickstart/sdk/rest_api.md @@ -34,7 +34,7 @@ ``` - 目前仅支持插入一条数据。 -- 数据需严格按照 schema 排列。 +- 数据需严格按照表 schema 排列。 请求数据样例: @@ -152,13 +152,13 @@ curl http://127.0.0.1:8080/dbs/demo_db/deployments/demo_data_service -X POST -d' 请求参数: -| **参数** | **类型** | **必需** | **说明** | -| -------- | -------- | -------- | ------------------------------------------------------------ | -| mode | String | 是 | 可配 `offsync` , `offasync`, `online` | -| sql | String | 是 | | -| input | Object | 否 | | -| schema | Array | 否 | 可支持数据类型(大小写不敏感):`Bool`, `Int16`, `Int32`, `Int64`, `Float`, `Double`, `String`, `Date` and `Timestamp`. 
| -| data | Array | 否 | | +| **参数** | **类型** | **必需** | **说明** | +| -------- | -------- | -------- | ----------------------------------------------------------------------------------------------------------------------- | +| mode | String | 是 | 可配 `offsync` , `offasync`, `online` | +| sql | String | 是 | | +| input | Object | 否 | | +| schema | Array | 否 | 可支持数据类型(大小写不敏感):`Bool`, `Int16`, `Int32`, `Int64`, `Float`, `Double`, `String`, `Date` and `Timestamp`. | +| data | Array | 否 | schema和data字段必须同时存在 | **请求数据样例** diff --git a/docs/zh/tutorial/tutorial_sql_1.md b/docs/zh/tutorial/tutorial_sql_1.md index aa73927ace7..bbe618bf384 100644 --- a/docs/zh/tutorial/tutorial_sql_1.md +++ b/docs/zh/tutorial/tutorial_sql_1.md @@ -144,6 +144,7 @@ window window_name as (PARTITION BY partition_col ORDER BY order_col ROWS_RANGE - OpenMLDB 的下界条数必须<=上界条数 更多语法和特性可以参考 [OpenMLDB窗口参考手册](../openmldb_sql/dql/WHERE_CLAUSE.md)。 + #### 示例 对于上面所示的交易表 t1,我们定义两个时间窗口和两个条数窗口。每一个样本行的窗口均按用户ID(`uid`)分组,按交易时间(`trans_time`)排序。下图展示了分组排序后的数据。 ![img](images/table_t1.jpg) @@ -240,7 +241,7 @@ xxx_cate(col, cate) over w - 参数`col`:参与聚合计算的列。 - 参数`cate`:分组列。 -目前支持的带有 _cate 后缀的聚合函为:`count_cate`, `sum_cate`, `avg_cate`, `max_cate`, `min_cate` +目前支持的带有 `_cate` 后缀的聚合函为:`count_cate`, `sum_cate`, `avg_cate`, `max_cate`, `min_cate` 相关示例如下: diff --git a/docs/zh/tutorial/tutorial_sql_2.md b/docs/zh/tutorial/tutorial_sql_2.md index 6e1658ad228..913b10a161d 100644 --- a/docs/zh/tutorial/tutorial_sql_2.md +++ b/docs/zh/tutorial/tutorial_sql_2.md @@ -64,7 +64,7 @@ SELECT * FROM s1 LAST JOIN s2 ORDER BY s2.std_ts ON s1.col1 = s2.col1; ## 3. 副表多行聚合特征 -OpenMLDB 针对副表拼接场景,扩展了标准的 WINDOW 语法,新增了 [WINDOW UNION](../openmldb_sql/dql/WINDOW_CLAUSE.md#windowunion) 的特性,支持从副表拼接多条数据形成副表窗口。在副表拼接窗口的基础上,可以方便构建副表多行聚合特征。同样地,构造副表多行聚合特征也需要完成两个步骤: +OpenMLDB 针对副表拼接场景,扩展了标准的 WINDOW 语法,新增了 [WINDOW UNION](../openmldb_sql/dql/WINDOW_CLAUSE.md#1-window--union) 的特性,支持从副表拼接多条数据形成副表窗口。在副表拼接窗口的基础上,可以方便构建副表多行聚合特征。同样地,构造副表多行聚合特征也需要完成两个步骤: - 步骤一:定义副表拼接窗口。 - 步骤二:在副表拼接窗口上构造副表多行聚合特征。 diff --git a/extensions/airflow-provider-openmldb/tests/hooks/test_openmldb_api_hook.py b/extensions/airflow-provider-openmldb/tests/hooks/test_openmldb_api_hook.py index 2dcd9f447f1..b3714ade402 100644 --- a/extensions/airflow-provider-openmldb/tests/hooks/test_openmldb_api_hook.py +++ b/extensions/airflow-provider-openmldb/tests/hooks/test_openmldb_api_hook.py @@ -120,7 +120,7 @@ def test_query_api_server_without_data(self): # no data response = hook.run() res = json.loads(response.text) - assert res == {'code': -1, 'msg': 'Json parse failed'} + assert res == {'code': -1, 'msg': 'Request body json parse failed'} def test_query_api_server_with_sql(self): hook = OpenMLDBHook() @@ -133,7 +133,7 @@ def test_query_api_server_without_mode(self): response = hook.run(data='{"sql":"select 1"}') res = json.loads(response.text) assert res['code'] == -1 - assert res['msg'].startswith('Json parse failed') + assert res['msg'].startswith('Request body json parse failed') def test_query_api_server(self): hook = OpenMLDBHook() diff --git a/extensions/kafka-connect-jdbc/src/main/java/io/confluent/connect/jdbc/dialect/OpenmldbDatabaseDialect.java b/extensions/kafka-connect-jdbc/src/main/java/io/confluent/connect/jdbc/dialect/OpenmldbDatabaseDialect.java index 75b3d92cb50..29e5b612b7f 100644 --- a/extensions/kafka-connect-jdbc/src/main/java/io/confluent/connect/jdbc/dialect/OpenmldbDatabaseDialect.java +++ b/extensions/kafka-connect-jdbc/src/main/java/io/confluent/connect/jdbc/dialect/OpenmldbDatabaseDialect.java @@ -196,7 
+196,7 @@ protected String addFieldToSchema(final ColumnDefinition columnDefn, final Schem default: { } } if (schemaBuilder == null) { - log.warn("openmldb schema builder for sqlType {} is null, " + log.debug("openmldb schema builder for sqlType {} is null, " + "use GenericDatabaseDialect method", sqlType); return super.addFieldToSchema(columnDefn, builder, fieldName, sqlType, optional); diff --git a/hybridse/include/base/fe_status.h b/hybridse/include/base/fe_status.h index b91b8d8fb16..8f11a16a8c8 100644 --- a/hybridse/include/base/fe_status.h +++ b/hybridse/include/base/fe_status.h @@ -16,11 +16,12 @@ #ifndef HYBRIDSE_INCLUDE_BASE_FE_STATUS_H_ #define HYBRIDSE_INCLUDE_BASE_FE_STATUS_H_ + +#include #include #include -#include "glog/logging.h" + #include "proto/fe_common.pb.h" -#include "proto/fe_type.pb.h" namespace hybridse { namespace base { diff --git a/hybridse/include/node/expr_node.h b/hybridse/include/node/expr_node.h index 442064b6873..490e4d48c28 100644 --- a/hybridse/include/node/expr_node.h +++ b/hybridse/include/node/expr_node.h @@ -18,7 +18,6 @@ #define HYBRIDSE_INCLUDE_NODE_EXPR_NODE_H_ #include -#include #include "base/fe_status.h" #include "codec/fe_row_codec.h" diff --git a/hybridse/include/node/node_base.h b/hybridse/include/node/node_base.h index 8aa678c90a8..c6894f2b682 100644 --- a/hybridse/include/node/node_base.h +++ b/hybridse/include/node/node_base.h @@ -22,7 +22,6 @@ #include #include "base/fe_object.h" -#include "glog/logging.h" #include "node/node_enum.h" namespace hybridse { diff --git a/hybridse/include/node/node_enum.h b/hybridse/include/node/node_enum.h index 7c9ebf0ecbe..ad8f03bf422 100644 --- a/hybridse/include/node/node_enum.h +++ b/hybridse/include/node/node_enum.h @@ -17,9 +17,6 @@ #ifndef HYBRIDSE_INCLUDE_NODE_NODE_ENUM_H_ #define HYBRIDSE_INCLUDE_NODE_NODE_ENUM_H_ -#include -#include "proto/fe_common.pb.h" -#include "proto/fe_type.pb.h" namespace hybridse { namespace node { @@ -98,6 +95,9 @@ enum SqlNodeType { kAlterTableStmt, kShowStmt, kCompressType, + kColumnSchema, + kCreateUserStmt, + kAlterUserStmt, kSqlNodeTypeLast, // debug type }; @@ -143,7 +143,8 @@ enum ExprType { kExprIn, kExprEscaped, kExprArray, - kExprFake, // not a real one + kExprArrayElement, // extract value from a array or map, with `[]` operator + kExprFake, // not a real one kExprLast = kExprFake, }; @@ -175,9 +176,21 @@ enum DataType { kArray, // fixed size. In SQL: [1, 2, 3] or ARRAY[1, 2, 3] kDataTypeFake, // not a data type, for testing purpose only kLastDataType = kDataTypeFake, + // the tree type are not moved above kLastDataType for compatibility // it may necessary to do it in the further + + // kVoid + // A distinct data type: signifies no value or meaningful result. + // Typically used for function that does not returns value. kVoid = 100, + // kNull + // A special marker representing the absence of a value. + // Not a true data type but a placeholder for missing or unknown information. 
+ // A `NULL` literal can be eventually resolved to: + // - NULL of void type, if no extra info provided: 'SELECT NULL' + // - NULL of int (or any other) type, extra information provided, e.g with 'CAST' operator + // 'SELECT CAST(NULL as INT)' kNull = 101, kPlaceholder = 102 }; @@ -289,6 +302,8 @@ enum CmdType { kCmdShowJobLog, kCmdShowCreateTable, kCmdTruncate, + kCmdDropUser, + kCmdShowUser, kCmdFake, // not a real cmd, for testing purpose only kLastCmd = kCmdFake, }; @@ -327,6 +342,8 @@ enum PlanType { kPlanTypeWithClauseEntry, kPlanTypeAlterTable, kPlanTypeShow, + kPlanTypeCreateUser, + kPlanTypeAlterUser, kUnknowPlan = -1, }; diff --git a/hybridse/include/node/node_manager.h b/hybridse/include/node/node_manager.h index 6949faf6f88..fdee40b20e9 100644 --- a/hybridse/include/node/node_manager.h +++ b/hybridse/include/node/node_manager.h @@ -21,7 +21,6 @@ #ifndef HYBRIDSE_INCLUDE_NODE_NODE_MANAGER_H_ #define HYBRIDSE_INCLUDE_NODE_NODE_MANAGER_H_ -#include #include #include #include @@ -166,16 +165,12 @@ class NodeManager { SqlNode *MakeInsertTableNode(const std::string &db_name, const std::string &table_name, const ExprListNode *column_names, - const ExprListNode *values); + const ExprListNode *values, InsertStmt::InsertMode insert_mode); CreateStmt *MakeCreateTableNode(bool op_if_not_exist, const std::string &db_name, const std::string &table_name, SqlNodeList *column_desc_list, SqlNodeList *partition_meta_list); - SqlNode *MakeColumnDescNode(const std::string &column_name, - const DataType data_type, - bool op_not_null, - ExprNode* default_value = nullptr); SqlNode *MakeColumnIndexNode(SqlNodeList *keys, SqlNode *ts, SqlNode *ttl, SqlNode *version); SqlNode *MakeColumnIndexNode(SqlNodeList *index_item_list); diff --git a/hybridse/include/node/plan_node.h b/hybridse/include/node/plan_node.h index 3085b27c699..c4fcf3beadb 100644 --- a/hybridse/include/node/plan_node.h +++ b/hybridse/include/node/plan_node.h @@ -722,6 +722,39 @@ class CreateIndexPlanNode : public LeafPlanNode { void Print(std::ostream &output, const std::string &orgTab) const; const CreateIndexNode *create_index_node_; }; + +class CreateUserPlanNode : public LeafPlanNode { + public: + explicit CreateUserPlanNode(const std::string& name, bool if_not_exists, std::shared_ptr options) + : LeafPlanNode(kPlanTypeCreateUser), name_(name), if_not_exists_(if_not_exists), options_(options) {} + ~CreateUserPlanNode() = default; + void Print(std::ostream &output, const std::string &orgTab) const; + const std::string& Name() const { return name_; } + bool IfNotExists() const { return if_not_exists_; } + const std::shared_ptr Options() const { return options_; } + + private: + const std::string name_; + const bool if_not_exists_ = false; + const std::shared_ptr options_; +}; + +class AlterUserPlanNode : public LeafPlanNode { + public: + explicit AlterUserPlanNode(const std::string& name, bool if_exists, std::shared_ptr options) + : LeafPlanNode(kPlanTypeAlterUser), name_(name), if_exists_(if_exists), options_(options) {} + ~AlterUserPlanNode() = default; + void Print(std::ostream &output, const std::string &orgTab) const; + const std::string& Name() const { return name_; } + bool IfExists() const { return if_exists_; } + const std::shared_ptr Options() const { return options_; } + + private: + const std::string name_; + const bool if_exists_ = false; + const std::shared_ptr options_; +}; + class CreateProcedurePlanNode : public MultiChildPlanNode { public: CreateProcedurePlanNode(const std::string &sp_name, const NodePointVector 
&input_parameter_list, diff --git a/hybridse/include/node/sql_node.h b/hybridse/include/node/sql_node.h index 8d641ad8283..d127fccc71a 100644 --- a/hybridse/include/node/sql_node.h +++ b/hybridse/include/node/sql_node.h @@ -371,12 +371,14 @@ typedef std::vector NodePointVector; // supported as: // - ADD PATH // - DROP PATH +// - SET OPTIONS // all else is unsupported class AlterActionBase : public base::FeBaseObject { public: enum class ActionKind { ADD_PATH = 0, - DROP_PATH + DROP_PATH, + SET_OPTIONS }; explicit AlterActionBase(ActionKind k) : kind_(k) {} @@ -406,6 +408,16 @@ class DropPathAction : public AlterActionBase { std::string target_; }; +class SetOptionsAction : public AlterActionBase { + public: + explicit SetOptionsAction(std::shared_ptr options) + : AlterActionBase(ActionKind::SET_OPTIONS), options_(options) {} + std::string DebugString() const override; + const std::shared_ptr Options() const { return options_; } + + private: + const std::shared_ptr options_; +}; class AlterTableStmt: public SqlNode { public: @@ -450,9 +462,7 @@ class ExprNode : public SqlNode { uint32_t GetChildNum() const { return children_.size(); } const ExprType GetExprType() const { return expr_type_; } - void PushBack(ExprNode *node_ptr) { children_.push_back(node_ptr); } - std::vector children_; void Print(std::ostream &output, const std::string &org_tab) const override; virtual const std::string GetExprString() const; virtual const std::string GenerateExpressionName() const; @@ -542,6 +552,8 @@ class ExprNode : public SqlNode { static Status RlikeTypeAccept(node::NodeManager* nm, const TypeNode* lhs, const TypeNode* rhs, const TypeNode** output); + std::vector children_; + private: const TypeNode *output_type_ = nullptr; bool nullable_ = true; @@ -570,10 +582,26 @@ class ArrayExpr : public ExprNode { Status InferAttr(ExprAnalysisContext *ctx) override; - // array type may specific already in SQL, e.g. ARRAY[1,2,3] + // array type may specified type in SQL already, e.g. 
ARRAY[1,2,3] TypeNode* specific_type_ = nullptr; }; +// extract value from array or map value, using '[]' operator +class ArrayElementExpr : public ExprNode { + public: + ArrayElementExpr(ExprNode *array, ExprNode *pos); + ~ArrayElementExpr() override {} + + ExprNode *array() const; + ExprNode *position() const; + + void Print(std::ostream &output, const std::string &org_tab) const override; + const std::string GetExprString() const override; + ArrayElementExpr *ShadowCopy(NodeManager *nm) const override; + + Status InferAttr(ExprAnalysisContext *ctx) override; +}; + class FnNode : public SqlNode { public: FnNode() : SqlNode(kFn, 0, 0), indent(0) {} @@ -1836,48 +1864,86 @@ class ResTarget : public SqlNode { NodePointVector indirection_; /* subscripts, field names, and '*', or NIL */ }; +class ColumnSchemaNode : public SqlNode { + public: + ColumnSchemaNode(DataType type, bool attr_not_null, const ExprNode *default_val = nullptr) + : SqlNode(kColumnSchema, 0, 0), type_(type), not_null_(attr_not_null), default_value_(default_val) {} + + ColumnSchemaNode(DataType type, absl::Span generics, bool attr_not_null, + const ExprNode *default_val) + : SqlNode(kColumnSchema, 0, 0), + type_(type), + generics_(generics.begin(), generics.end()), + not_null_(attr_not_null), + default_value_(default_val) {} + ~ColumnSchemaNode() override {} + + DataType type() const { return type_; } + absl::Span generics() const { return generics_; } + bool not_null() const { return not_null_; } + const ExprNode *default_value() const { return default_value_; } + + std::string DebugString() const; + + private: + DataType type_; + std::vector generics_; + bool not_null_; + const ExprNode* default_value_ = nullptr; +}; + class ColumnDefNode : public SqlNode { public: - ColumnDefNode() : SqlNode(kColumnDesc, 0, 0), column_name_(""), column_type_() {} - ColumnDefNode(const std::string &name, const DataType &data_type, bool op_not_null, ExprNode *default_value) - : SqlNode(kColumnDesc, 0, 0), - column_name_(name), - column_type_(data_type), - op_not_null_(op_not_null), - default_value_(default_value) {} + ColumnDefNode(const std::string &name, const ColumnSchemaNode *schema) + : SqlNode(kColumnDesc, 0, 0), column_name_(name), schema_(schema) {} ~ColumnDefNode() {} std::string GetColumnName() const { return column_name_; } - DataType GetColumnType() const { return column_type_; } + DataType GetColumnType() const { return schema_->type(); } - ExprNode* GetDefaultValue() const { return default_value_; } + const ExprNode* GetDefaultValue() const { return schema_->default_value(); } + + bool GetIsNotNull() const { return schema_->not_null(); } - bool GetIsNotNull() const { return op_not_null_; } void Print(std::ostream &output, const std::string &org_tab) const; private: std::string column_name_; - DataType column_type_; - bool op_not_null_; - ExprNode* default_value_ = nullptr; + const ColumnSchemaNode* schema_; }; class InsertStmt : public SqlNode { public: + // ref zetasql ASTInsertStatement + enum InsertMode { + DEFAULT_MODE, // plain INSERT + REPLACE, // INSERT OR REPLACE + UPDATE, // INSERT OR UPDATE + IGNORE // INSERT OR IGNORE + }; + InsertStmt(const std::string &db_name, const std::string &table_name, const std::vector &columns, - const std::vector &values) + const std::vector &values, + InsertMode insert_mode) : SqlNode(kInsertStmt, 0, 0), db_name_(db_name), table_name_(table_name), columns_(columns), values_(values), - is_all_(columns.empty()) {} + is_all_(columns.empty()), + insert_mode_(insert_mode) {} - 
InsertStmt(const std::string &db_name, const std::string &table_name, const std::vector &values) - : SqlNode(kInsertStmt, 0, 0), db_name_(db_name), table_name_(table_name), values_(values), is_all_(true) {} + InsertStmt(const std::string &db_name, const std::string &table_name, const std::vector &values, + InsertMode insert_mode) + : SqlNode(kInsertStmt, 0, 0), + db_name_(db_name), + table_name_(table_name), + values_(values), + is_all_(true), + insert_mode_(insert_mode) {} void Print(std::ostream &output, const std::string &org_tab) const; const std::string db_name_; @@ -1885,6 +1951,7 @@ class InsertStmt : public SqlNode { const std::vector columns_; const std::vector values_; const bool is_all_; + const InsertMode insert_mode_; }; class StorageModeNode : public SqlNode { @@ -2286,6 +2353,38 @@ class CreateIndexNode : public SqlNode { node::ColumnIndexNode *index_; }; +class CreateUserNode : public SqlNode { + public: + explicit CreateUserNode(const std::string &name, + bool if_not_exists, const std::shared_ptr& options) + : SqlNode(kCreateUserStmt, 0, 0), + name_(name), if_not_exists_(if_not_exists), options_(options) {} + void Print(std::ostream &output, const std::string &org_tab) const; + const std::string& Name() const { return name_; } + bool IfNotExists() const { return if_not_exists_; } + const std::shared_ptr Options() const { return options_; } + + private: + const std::string name_; + bool if_not_exists_; + const std::shared_ptr options_; +}; + +class AlterUserNode : public SqlNode { + public: + explicit AlterUserNode(const std::string &name, bool if_exists, const std::shared_ptr& options) + : SqlNode(kAlterUserStmt, 0, 0), name_(name), if_exists_(if_exists), options_(options) {} + void Print(std::ostream &output, const std::string &org_tab) const; + const std::string& Name() const { return name_; } + bool IfExists() const { return if_exists_; } + const std::shared_ptr Options() const { return options_; } + + private: + const std::string name_; + bool if_exists_ = false; + const std::shared_ptr options_; +}; + class ExplainNode : public SqlNode { public: explicit ExplainNode(const QueryNode *query, node::ExplainType explain_type) diff --git a/hybridse/include/node/type_node.h b/hybridse/include/node/type_node.h index e27ef34ce46..110b6329e59 100644 --- a/hybridse/include/node/type_node.h +++ b/hybridse/include/node/type_node.h @@ -21,6 +21,7 @@ #include #include "codec/fe_row_codec.h" +#include "node/expr_node.h" #include "node/sql_node.h" #include "vm/schemas_context.h" @@ -31,7 +32,7 @@ class NodeManager; class TypeNode : public SqlNode { public: - TypeNode() : SqlNode(node::kType, 0, 0), base_(hybridse::node::kNull) {} + TypeNode() : SqlNode(node::kType, 0, 0), base_(hybridse::node::kVoid) {} explicit TypeNode(hybridse::node::DataType base) : SqlNode(node::kType, 0, 0), base_(base), generics_({}) {} explicit TypeNode(hybridse::node::DataType base, const TypeNode *v1) @@ -48,44 +49,44 @@ class TypeNode : public SqlNode { generics_nullable_({false, false}) {} ~TypeNode() override {} - friend bool operator==(const TypeNode& lhs, const TypeNode& rhs); + friend bool operator==(const TypeNode &lhs, const TypeNode &rhs); + + // Return this node cast as a NodeType. + // Use only when this node is known to be that type, otherwise, behavior is undefined. 
+ template + const NodeType *GetAsOrNull() const { + static_assert(std::is_base_of::value, + "NodeType must be a member of the TypeNode class hierarchy"); + return dynamic_cast(this); + } + + template + NodeType *GetAsOrNull() { + static_assert(std::is_base_of::value, + "NodeType must be a member of the TypeNode class hierarchy"); + return dynamic_cast(this); + } // canonical name for the type // this affect the function generated by codegen - virtual const std::string GetName() const { - std::string type_name = DataTypeName(base_); - if (!generics_.empty()) { - for (auto type : generics_) { - type_name.append("_"); - type_name.append(type->GetName()); - } - } - return type_name; - } + virtual const std::string GetName() const; // readable string representation virtual std::string DebugString() const; - const hybridse::node::TypeNode *GetGenericType(size_t idx) const { - return generics_[idx]; - } + const hybridse::node::TypeNode *GetGenericType(size_t idx) const; bool IsGenericNullable(size_t idx) const { return generics_nullable_[idx]; } size_t GetGenericSize() const { return generics_.size(); } hybridse::node::DataType base() const { return base_; } - const std::vector &generics() const { - return generics_; - } + const std::vector &generics() const { return generics_; } - void AddGeneric(const node::TypeNode *dtype, bool nullable) { - generics_.push_back(dtype); - generics_nullable_.push_back(nullable); - } + void AddGeneric(const node::TypeNode *dtype, bool nullable); void Print(std::ostream &output, const std::string &org_tab) const override; - virtual bool Equals(const SqlNode *node) const; + bool Equals(const SqlNode *node) const override; TypeNode *ShadowCopy(NodeManager *) const override; TypeNode *DeepCopy(NodeManager *) const override; @@ -105,9 +106,22 @@ class TypeNode : public SqlNode { bool IsFloating() const; bool IsGeneric() const; + virtual bool IsMap() const { return false; } + virtual bool IsArray() const { return base_ == kArray; } + static Status CheckTypeNodeNotNull(const TypeNode *left_type); hybridse::node::DataType base_; + + // generics_ not empty if it is a complex data type: + // 1. base = ARRAY, generics = [ element_type ] + // 2. base = MAP, generics = [ key_type, value_type ] + // 3. base = STRUCT, generics = [ fileld_type, ... ] (unimplemented) + // inner types, not exists in SQL level + // 4. base = LIST, generics = [ element_type ] + // 5. base = ITERATOR, generics = [ element_type ] + // 6. base = TUPLE (like STRUCT), generics = [ element_type, ... ] + // 7. ... 
(might others, undocumented) std::vector generics_; std::vector generics_nullable_; }; @@ -120,9 +134,7 @@ class OpaqueTypeNode : public TypeNode { size_t bytes() const { return bytes_; } - const std::string GetName() const override { - return "opaque<" + std::to_string(bytes_) + ">"; - } + const std::string GetName() const override; OpaqueTypeNode *ShadowCopy(NodeManager *) const override; @@ -173,11 +185,28 @@ class FixedArrayType : public TypeNode { std::string DebugString() const override; FixedArrayType *ShadowCopy(NodeManager *) const override; + bool IsArray() const override { return true; } + private: const TypeNode* ele_ty_; uint64_t num_elements_; }; +class MapType : public TypeNode { + public: + MapType(const TypeNode *key_ty, const TypeNode *value_ty, bool value_not_null = false) ABSL_ATTRIBUTE_NONNULL(); + ~MapType() override; + + bool IsMap() const override { return true; } + + const TypeNode *key_type() const; + const TypeNode *value_type() const; + bool value_nullable() const; + + // test if input args can safely apply to a map function + static absl::StatusOr InferMapType(NodeManager *, absl::Span types); +}; + } // namespace node } // namespace hybridse #endif // HYBRIDSE_INCLUDE_NODE_TYPE_NODE_H_ diff --git a/hybridse/include/plan/plan_api.h b/hybridse/include/plan/plan_api.h index 0ad45f91f9f..1e4f3b74845 100644 --- a/hybridse/include/plan/plan_api.h +++ b/hybridse/include/plan/plan_api.h @@ -15,9 +15,13 @@ */ #ifndef HYBRIDSE_INCLUDE_PLAN_PLAN_API_H_ #define HYBRIDSE_INCLUDE_PLAN_PLAN_API_H_ + #include #include + #include "node/node_manager.h" +#include "vm/sql_ctx.h" + namespace hybridse { namespace plan { @@ -27,6 +31,10 @@ using hybridse::node::NodePointVector; using hybridse::node::PlanNodeList; class PlanAPI { public: + // parse SQL string to logic plan. ASTNode and LogicNode saved in SqlContext + static base::Status CreatePlanTreeFromScript(vm::SqlContext* ctx); + + // deprecated, use CreatePlanTreeFromScript(vm::SqlContext*) instead static bool CreatePlanTreeFromScript(const std::string& sql, PlanNodeList& plan_trees, // NOLINT NodeManager* node_manager, @@ -34,6 +42,7 @@ class PlanAPI { bool is_batch_mode = true, bool is_cluster = false, bool enable_batch_window_parallelization = false, const std::unordered_map* extra_options = nullptr); + static const int GetPlanLimitCount(node::PlanNode* plan_trees); static const std::string GenerateName(const std::string prefix, int id); }; diff --git a/hybridse/include/vm/sql_ctx.h b/hybridse/include/vm/sql_ctx.h new file mode 100644 index 00000000000..25182b86647 --- /dev/null +++ b/hybridse/include/vm/sql_ctx.h @@ -0,0 +1,91 @@ +/** + * Copyright (c) 2023 OpenMLDB Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef HYBRIDSE_INCLUDE_VM_SQL_CTX_H_ +#define HYBRIDSE_INCLUDE_VM_SQL_CTX_H_ + +#include +#include +#include + +#include "node/node_manager.h" +#include "vm/engine_context.h" + +namespace zetasql { +class ParserOutput; +} + +namespace hybridse { +namespace vm { + +class HybridSeJitWrapper; +class ClusterJob; + +struct SqlContext { + // mode: batch|request|batch request + ::hybridse::vm::EngineMode engine_mode; + bool is_cluster_optimized = false; + bool is_batch_request_optimized = false; + bool enable_expr_optimize = false; + bool enable_batch_window_parallelization = true; + bool enable_window_column_pruning = false; + + // the sql content + std::string sql; + // the database + std::string db; + + std::unique_ptr ast_node; + // the logical plan + ::hybridse::node::PlanNodeList logical_plan; + ::hybridse::vm::PhysicalOpNode* physical_plan = nullptr; + + std::shared_ptr cluster_job; + // TODO(wangtaize) add a light jit engine + // eg using bthead to compile ir + hybridse::vm::JitOptions jit_options; + std::shared_ptr jit = nullptr; + Schema schema; + Schema request_schema; + std::string request_db_name; + std::string request_name; + Schema parameter_types; + uint32_t row_size; + uint32_t limit_cnt = 0; + std::string ir; + std::string logical_plan_str; + std::string physical_plan_str; + std::string encoded_schema; + std::string encoded_request_schema; + ::hybridse::node::NodeManager nm; + ::hybridse::udf::UdfLibrary* udf_library = nullptr; + + ::hybridse::vm::BatchRequestInfo batch_request_info; + + std::shared_ptr> options; + + // [ALPHA] SQL diagnostic infos + // not standardized, only index hints, no error, no warning, no other hint/info + std::shared_ptr index_hints; + + SqlContext(); + ~SqlContext(); +}; + +} // namespace vm +} // namespace hybridse + +#endif // HYBRIDSE_INCLUDE_VM_SQL_CTX_H_ diff --git a/hybridse/src/case/sql_case.cc b/hybridse/src/case/sql_case.cc index c98defb679b..be0633dc703 100644 --- a/hybridse/src/case/sql_case.cc +++ b/hybridse/src/case/sql_case.cc @@ -751,7 +751,7 @@ const std::string SqlCase::case_name() const { } bool SqlCase::ExtractInputTableDef(type::TableDef& table, int32_t input_idx) const { - if (inputs_.size() <= input_idx) { + if (inputs_.size() <= static_cast(input_idx)) { return false; } return ExtractInputTableDef(inputs_[input_idx], table); diff --git a/hybridse/src/codegen/aggregate_ir_builder.cc b/hybridse/src/codegen/aggregate_ir_builder.cc index 19e2a4f5cc3..22de3d3d742 100644 --- a/hybridse/src/codegen/aggregate_ir_builder.cc +++ b/hybridse/src/codegen/aggregate_ir_builder.cc @@ -21,10 +21,10 @@ #include #include +#include "codegen/buf_ir_builder.h" #include "codegen/expr_ir_builder.h" #include "codegen/ir_base_builder.h" #include "codegen/variable_ir_builder.h" -#include "gflags/gflags.h" #include "glog/logging.h" namespace hybridse { namespace codegen { diff --git a/hybridse/src/codegen/array_ir_builder.cc b/hybridse/src/codegen/array_ir_builder.cc index 5bf1bf06e99..5f3d22edc5c 100644 --- a/hybridse/src/codegen/array_ir_builder.cc +++ b/hybridse/src/codegen/array_ir_builder.cc @@ -17,26 +17,26 @@ #include "codegen/array_ir_builder.h" #include + +#include "codegen/context.h" #include "codegen/ir_base_builder.h" namespace hybridse { namespace codegen { +#define SZ_IDX 2 +#define RAW_IDX 0 +#define NULL_IDX 1 + ArrayIRBuilder::ArrayIRBuilder(::llvm::Module* m, llvm::Type* ele_ty) : StructTypeIRBuilder(m), element_type_(ele_ty) { InitStructType(); } -ArrayIRBuilder::ArrayIRBuilder(::llvm::Module* m, llvm::Type* ele_ty, llvm::Value* 
diff --git a/hybridse/src/case/sql_case.cc b/hybridse/src/case/sql_case.cc
index c98defb679b..be0633dc703 100644
--- a/hybridse/src/case/sql_case.cc
+++ b/hybridse/src/case/sql_case.cc
@@ -751,7 +751,7 @@ const std::string SqlCase::case_name() const {
 }
 
 bool SqlCase::ExtractInputTableDef(type::TableDef& table, int32_t input_idx) const {
-    if (inputs_.size() <= input_idx) {
+    if (inputs_.size() <= static_cast<size_t>(input_idx)) {
         return false;
     }
     return ExtractInputTableDef(inputs_[input_idx], table);
diff --git a/hybridse/src/codegen/aggregate_ir_builder.cc b/hybridse/src/codegen/aggregate_ir_builder.cc
index 19e2a4f5cc3..22de3d3d742 100644
--- a/hybridse/src/codegen/aggregate_ir_builder.cc
+++ b/hybridse/src/codegen/aggregate_ir_builder.cc
@@ -21,10 +21,10 @@
 #include
 #include
 
+#include "codegen/buf_ir_builder.h"
 #include "codegen/expr_ir_builder.h"
 #include "codegen/ir_base_builder.h"
 #include "codegen/variable_ir_builder.h"
-#include "gflags/gflags.h"
 #include "glog/logging.h"
 
 namespace hybridse {
 namespace codegen {
diff --git a/hybridse/src/codegen/array_ir_builder.cc b/hybridse/src/codegen/array_ir_builder.cc
index 5bf1bf06e99..5f3d22edc5c 100644
--- a/hybridse/src/codegen/array_ir_builder.cc
+++ b/hybridse/src/codegen/array_ir_builder.cc
@@ -17,26 +17,26 @@
 #include "codegen/array_ir_builder.h"
 
 #include
+
+#include "codegen/context.h"
 #include "codegen/ir_base_builder.h"
 
 namespace hybridse {
 namespace codegen {
 
+#define SZ_IDX 2
+#define RAW_IDX 0
+#define NULL_IDX 1
+
 ArrayIRBuilder::ArrayIRBuilder(::llvm::Module* m, llvm::Type* ele_ty)
     : StructTypeIRBuilder(m), element_type_(ele_ty) {
     InitStructType();
 }
 
-ArrayIRBuilder::ArrayIRBuilder(::llvm::Module* m, llvm::Type* ele_ty, llvm::Value* num_ele)
-    : StructTypeIRBuilder(m), element_type_(ele_ty), num_elements_(num_ele) {
-    InitStructType();
-}
-
 void ArrayIRBuilder::InitStructType() {
     // name must unique between different array type
     std::string name = absl::StrCat("fe.array_", GetLlvmObjectString(element_type_));
-    ::llvm::StringRef sr(name);
-    ::llvm::StructType* stype = m_->getTypeByName(sr);
+    ::llvm::StructType* stype = m_->getTypeByName(name);
     if (stype != NULL) {
         struct_type_ = stype;
         return;
@@ -46,29 +46,36 @@ void ArrayIRBuilder::InitStructType() {
     ::llvm::Type* arr_type = element_type_->getPointerTo();
     ::llvm::Type* nullable_type = ::llvm::IntegerType::getInt1Ty(m_->getContext())->getPointerTo();
     ::llvm::Type* size_type = ::llvm::IntegerType::getInt64Ty(m_->getContext());
-    std::vector<::llvm::Type*> elements = {arr_type, nullable_type, size_type};
-    stype->setBody(::llvm::ArrayRef<::llvm::Type*>(elements));
+    stype->setBody({arr_type, nullable_type, size_type});
     struct_type_ = stype;
 }
 
-base::Status ArrayIRBuilder::NewFixedArray(llvm::BasicBlock* bb, const std::vector<NativeValue>& elements,
-                                           NativeValue* output) const {
-    // TODO(ace): reduce IR size with loop block
-
-    CHECK_TRUE(num_elements_ != nullptr, common::kCodegenError, "num elements unknown");
-
+absl::StatusOr<NativeValue> ArrayIRBuilder::Construct(CodeGenContext* ctx,
+                                                      absl::Span<NativeValue const> elements) const {
+    auto bb = ctx->GetCurrentBlock();
     // alloc array struct
     llvm::Value* array_alloca = nullptr;
-    CHECK_TRUE(Create(bb, &array_alloca), common::kCodegenError, "can't create struct type for array");
+    if (!Allocate(bb, &array_alloca)) {
+        return absl::InternalError("can't create struct type for array");
+    }
 
     // ============================
     // Init array elements
     // ============================
     llvm::IRBuilder<> builder(bb);
+    auto num_elements = ctx->GetBuilder()->getInt64(elements.size());
+    if (!Set(bb, array_alloca, SZ_IDX, num_elements)) {
+        return absl::InternalError("fail to set array size");
+    }
+
+    if (elements.empty()) {
+        // empty array
+        return NativeValue::Create(array_alloca);
+    }
 
     // init raw array and nullable array
-    auto* raw_array_ptr = builder.CreateAlloca(element_type_, num_elements_);
-    auto* nullables_ptr = builder.CreateAlloca(builder.getInt1Ty(), num_elements_);
+    auto* raw_array_ptr = builder.CreateAlloca(element_type_, num_elements);
+    auto* nullables_ptr = builder.CreateAlloca(builder.getInt1Ty(), num_elements);
 
     // fullfill the array struct
     auto* idx_val_ptr = builder.CreateAlloca(builder.getInt64Ty());
@@ -88,41 +95,26 @@ base::Status ArrayIRBuilder::NewFixedArray(llvm::BasicBlock* bb, const std::vect
     }
 
     // Set raw array
-    CHECK_TRUE(Set(bb, array_alloca, 0, raw_array_ptr), common::kCodegenError);
+    if (!Set(bb, array_alloca, RAW_IDX, raw_array_ptr)) {
+        return absl::InternalError("fail to set array values");
+    }
     // Set nullable list
-    CHECK_TRUE(Set(bb, array_alloca, 1, nullables_ptr), common::kCodegenError);
-
-    ::llvm::Value* array_sz = builder.CreateLoad(idx_val_ptr);
-    CHECK_TRUE(Set(bb, array_alloca, 2, array_sz), common::kCodegenError);
-
-    *output = NativeValue::Create(array_alloca);
-    return base::Status::OK();
-}
-
-
-base::Status ArrayIRBuilder::NewEmptyArray(llvm::BasicBlock* bb, NativeValue* output) const {
-    llvm::Value* array_alloca = nullptr;
-    CHECK_TRUE(Create(bb, &array_alloca), common::kCodegenError, "can't create struct type for array");
-
-    llvm::IRBuilder<> builder(bb);
-
-    ::llvm::Value* array_sz = builder.getInt64(0);
-    CHECK_TRUE(Set(bb, array_alloca, 2, array_sz), common::kCodegenError);
-
-    *output =
NativeValue::Create(array_alloca); + if (!Set(bb, array_alloca, NULL_IDX, nullables_ptr)) { + return absl::InternalError("fail to set array nulls"); + } - return base::Status::OK(); + return NativeValue::Create(array_alloca); } bool ArrayIRBuilder::CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) { llvm::Value* array_alloca = nullptr; - if (!Create(block, &array_alloca)) { + if (!Allocate(block, &array_alloca)) { return false; } llvm::IRBuilder<> builder(block); ::llvm::Value* array_sz = builder.getInt64(0); - if (!Set(block, array_alloca, 2, array_sz)) { + if (!Set(block, array_alloca, SZ_IDX, array_sz)) { return false; } diff --git a/hybridse/src/codegen/array_ir_builder.h b/hybridse/src/codegen/array_ir_builder.h index 66ef2fe05da..b6ff275ac45 100644 --- a/hybridse/src/codegen/array_ir_builder.h +++ b/hybridse/src/codegen/array_ir_builder.h @@ -17,9 +17,6 @@ #ifndef HYBRIDSE_SRC_CODEGEN_ARRAY_IR_BUILDER_H_ #define HYBRIDSE_SRC_CODEGEN_ARRAY_IR_BUILDER_H_ -#include - -#include "absl/base/attributes.h" #include "codegen/struct_ir_builder.h" namespace hybridse { @@ -29,27 +26,15 @@ namespace codegen { // - Array of raw values: T* // - Array of nullable values: bool* // - array size: int64 -class ArrayIRBuilder : public StructTypeIRBuilder { +class ArrayIRBuilder : public StructTypeIRBuilder { public: // Array builder with num elements unknown ArrayIRBuilder(::llvm::Module* m, llvm::Type* ele_ty); - // Array builder with num elements known at some point - ArrayIRBuilder(::llvm::Module* m, llvm::Type* ele_ty, llvm::Value* num_ele); - ~ArrayIRBuilder() override {} // create a new array from `elements` as value - ABSL_MUST_USE_RESULT - base::Status NewFixedArray(llvm::BasicBlock* bb, const std::vector& elements, - NativeValue* output) const; - - ABSL_MUST_USE_RESULT - base::Status NewEmptyArray(llvm::BasicBlock* bb, NativeValue* output) const; - - void InitStructType() override; - - bool CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) override; + absl::StatusOr Construct(CodeGenContext* ctx, absl::Span args) const override; bool CopyFrom(::llvm::BasicBlock* block, ::llvm::Value* src, ::llvm::Value* dist) override { return true; } @@ -57,9 +42,13 @@ class ArrayIRBuilder : public StructTypeIRBuilder { CHECK_TRUE(false, common::kCodegenError, "casting to array un-implemented"); }; + private: + void InitStructType() override; + + bool CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) override; + private: ::llvm::Type* element_type_ = nullptr; - ::llvm::Value* num_elements_ = nullptr; }; } // namespace codegen diff --git a/hybridse/src/codegen/block_ir_builder.cc b/hybridse/src/codegen/block_ir_builder.cc index 6f53e80aa40..818229553ca 100644 --- a/hybridse/src/codegen/block_ir_builder.cc +++ b/hybridse/src/codegen/block_ir_builder.cc @@ -15,15 +15,15 @@ */ #include "codegen/block_ir_builder.h" + #include "codegen/context.h" #include "codegen/expr_ir_builder.h" +#include "codegen/ir_base_builder.h" #include "codegen/list_ir_builder.h" #include "codegen/struct_ir_builder.h" #include "codegen/type_ir_builder.h" #include "codegen/variable_ir_builder.h" #include "glog/logging.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/IR/CFG.h" #include "llvm/IR/IRBuilder.h" using ::hybridse::common::kCodegenError; diff --git a/hybridse/src/codegen/date_ir_builder.cc b/hybridse/src/codegen/date_ir_builder.cc index 19bf319d7c3..1bfb1d31160 100644 --- a/hybridse/src/codegen/date_ir_builder.cc +++ b/hybridse/src/codegen/date_ir_builder.cc @@ -55,7 +55,7 @@ bool 
DateIRBuilder::NewDate(::llvm::BasicBlock* block, ::llvm::Value** output) { return false; } ::llvm::Value* date; - if (!Create(block, &date)) { + if (!Allocate(block, &date)) { return false; } if (!SetDate(block, date, @@ -73,7 +73,7 @@ bool DateIRBuilder::NewDate(::llvm::BasicBlock* block, ::llvm::Value* days, return false; } ::llvm::Value* date; - if (!Create(block, &date)) { + if (!Allocate(block, &date)) { return false; } if (!SetDate(block, date, days)) { diff --git a/hybridse/src/codegen/date_ir_builder.h b/hybridse/src/codegen/date_ir_builder.h index d9004d48da1..1d51cc98ceb 100644 --- a/hybridse/src/codegen/date_ir_builder.h +++ b/hybridse/src/codegen/date_ir_builder.h @@ -28,8 +28,6 @@ class DateIRBuilder : public StructTypeIRBuilder { explicit DateIRBuilder(::llvm::Module* m); ~DateIRBuilder(); - void InitStructType() override; - bool CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) override; bool CopyFrom(::llvm::BasicBlock* block, ::llvm::Value* src, ::llvm::Value* dist) override; base::Status CastFrom(::llvm::BasicBlock* block, const NativeValue& src, NativeValue* output) override; @@ -46,6 +44,9 @@ class DateIRBuilder : public StructTypeIRBuilder { ::llvm::Value** output, base::Status& status); // NOLINT bool Year(::llvm::BasicBlock* block, ::llvm::Value* date, ::llvm::Value** output, base::Status& status); // NOLINT + private: + void InitStructType() override; + bool CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) override; }; } // namespace codegen } // namespace hybridse diff --git a/hybridse/src/codegen/expr_ir_builder.cc b/hybridse/src/codegen/expr_ir_builder.cc index 6b95bfb8ce1..ccf3838cbcf 100644 --- a/hybridse/src/codegen/expr_ir_builder.cc +++ b/hybridse/src/codegen/expr_ir_builder.cc @@ -19,8 +19,10 @@ #include #include #include +#include #include "base/numeric.h" +#include "codegen/arithmetic_expr_ir_builder.h" #include "codegen/array_ir_builder.h" #include "codegen/buf_ir_builder.h" #include "codegen/cond_select_ir_builder.h" @@ -28,11 +30,19 @@ #include "codegen/date_ir_builder.h" #include "codegen/ir_base_builder.h" #include "codegen/list_ir_builder.h" +#include "codegen/map_ir_builder.h" +#include "codegen/predicate_expr_ir_builder.h" +#include "codegen/scope_var.h" #include "codegen/timestamp_ir_builder.h" #include "codegen/type_ir_builder.h" #include "codegen/udf_ir_builder.h" +#include "codegen/variable_ir_builder.h" #include "codegen/window_ir_builder.h" #include "glog/logging.h" +#include "llvm/IR/IRBuilder.h" +#include "node/node_manager.h" +#include "node/type_node.h" +#include "passes/resolve_fn_and_attrs.h" #include "proto/fe_common.pb.h" #include "udf/default_udf_library.h" #include "vm/schemas_context.h" @@ -199,6 +209,10 @@ Status ExprIRBuilder::Build(const ::hybridse::node::ExprNode* node, CHECK_STATUS(BuildArrayExpr(dynamic_cast(node), output)); break; } + case ::hybridse::node::kExprArrayElement: { + CHECK_STATUS(BuildArrayElement(dynamic_cast(node), output)); + break; + } default: { return Status(kCodegenError, "Expression Type " + @@ -1157,13 +1171,6 @@ Status ExprIRBuilder::BuildArrayExpr(const ::hybridse::node::ArrayExpr* node, Na llvm::IRBuilder<> builder(ctx_->GetCurrentBlock()); - if (node->GetChildNum() == 0) { - // build empty array - ArrayIRBuilder ir_builder(ctx_->GetModule(), ele_type); - CHECK_STATUS(ir_builder.NewEmptyArray(ctx_->GetCurrentBlock(), output)); - return Status::OK(); - } - CastExprIRBuilder cast_builder(ctx_->GetCurrentBlock()); std::vector elements; for (auto& ele : node->children_) { 
@@ -1178,11 +1185,46 @@ Status ExprIRBuilder::BuildArrayExpr(const ::hybridse::node::ArrayExpr* node, Na
         }
     }
 
-    ::llvm::Value* num_elements = builder.getInt64(elements.size());
-    ArrayIRBuilder array_builder(ctx_->GetModule(), ele_type, num_elements);
-    CHECK_STATUS(array_builder.NewFixedArray(ctx_->GetCurrentBlock(), elements, output));
+    ArrayIRBuilder array_builder(ctx_->GetModule(), ele_type);
+    auto rs = array_builder.Construct(ctx_, elements);
+    if (!rs.ok()) {
+        FAIL_STATUS(kCodegenError, rs.status());
+    }
+
+    *output = rs.value();
     return Status::OK();
 }
 
+Status ExprIRBuilder::BuildArrayElement(const ::hybridse::node::ArrayElementExpr* expr, NativeValue* output) {
+    auto* arr_type = expr->array()->GetOutputType();
+    NativeValue arr_val;
+    CHECK_STATUS(Build(expr->array(), &arr_val));
+
+    NativeValue pos_val;
+    CHECK_STATUS(Build(expr->position(), &pos_val));
+
+    std::unique_ptr<StructTypeIRBuilder> type_builder;
+
+    if (arr_type->IsMap()) {
+        auto* map_type = arr_type->GetAsOrNull<node::MapType>();
+        ::llvm::Type* key_type = nullptr;
+        ::llvm::Type* value_type = nullptr;
+        CHECK_TRUE(GetLlvmType(ctx_->GetModule(), map_type->key_type(), &key_type), kCodegenError);
+        CHECK_TRUE(GetLlvmType(ctx_->GetModule(), map_type->value_type(), &value_type), kCodegenError);
+        type_builder.reset(new MapIRBuilder(ctx_->GetModule(), key_type, value_type));
+    } else if (arr_type->IsArray()) {
+        ::llvm::Type* ele_type = nullptr;
+        CHECK_TRUE(GetLlvmType(ctx_->GetModule(), arr_type->GetGenericType(0), &ele_type), kCodegenError);
+        type_builder.reset(new ArrayIRBuilder(ctx_->GetModule(), ele_type));
+    } else {
+        return {common::kCodegenError, absl::StrCat("can't get element from type ", arr_type->DebugString())};
+    }
+
+    auto res = type_builder->ExtractElement(ctx_, arr_val, pos_val);
+    CHECK_TRUE(res.ok(), common::kCodegenError, res.status().ToString());
+    *output = res.value();
+
+    return {};
+}
 }  // namespace codegen
 }  // namespace hybridse
diff --git a/hybridse/src/codegen/expr_ir_builder.h b/hybridse/src/codegen/expr_ir_builder.h
index 6838d96a88b..051c9a32bfd 100644
--- a/hybridse/src/codegen/expr_ir_builder.h
+++ b/hybridse/src/codegen/expr_ir_builder.h
@@ -17,24 +17,12 @@
 #ifndef HYBRIDSE_SRC_CODEGEN_EXPR_IR_BUILDER_H_
 #define HYBRIDSE_SRC_CODEGEN_EXPR_IR_BUILDER_H_
 
-#include
-#include
 #include
 #include
+
 #include "base/fe_status.h"
-#include "codegen/arithmetic_expr_ir_builder.h"
-#include "codegen/buf_ir_builder.h"
-#include "codegen/predicate_expr_ir_builder.h"
-#include "codegen/row_ir_builder.h"
-#include "codegen/scope_var.h"
-#include "codegen/variable_ir_builder.h"
-#include "codegen/window_ir_builder.h"
-#include "llvm/IR/IRBuilder.h"
-#include "node/node_manager.h"
+#include "codegen/context.h"
 #include "node/sql_node.h"
-#include "node/type_node.h"
-#include "passes/resolve_fn_and_attrs.h"
-#include "vm/schemas_context.h"
 
 namespace hybridse {
 namespace codegen {
@@ -117,6 +105,8 @@ class ExprIRBuilder {
 
     Status BuildArrayExpr(const ::hybridse::node::ArrayExpr* node, NativeValue* output);
 
+    Status BuildArrayElement(const ::hybridse::node::ArrayElementExpr*, NativeValue*);
+
  private:
     CodeGenContext* ctx_;
diff --git a/hybridse/src/codegen/fn_let_ir_builder.cc b/hybridse/src/codegen/fn_let_ir_builder.cc
index 362e4a83df6..6d8e86e3933 100644
--- a/hybridse/src/codegen/fn_let_ir_builder.cc
+++ b/hybridse/src/codegen/fn_let_ir_builder.cc
@@ -15,13 +15,14 @@
  */
 
 #include "codegen/fn_let_ir_builder.h"
+
 #include "codegen/aggregate_ir_builder.h"
+#include "codegen/buf_ir_builder.h"
 #include "codegen/context.h"
 #include
"codegen/expr_ir_builder.h" #include "codegen/ir_base_builder.h" #include "codegen/variable_ir_builder.h" #include "glog/logging.h" -#include "vm/transform.h" using ::hybridse::common::kCodegenError; diff --git a/hybridse/src/codegen/ir_base_builder.cc b/hybridse/src/codegen/ir_base_builder.cc index 992d41d0998..81fadbfdd3d 100644 --- a/hybridse/src/codegen/ir_base_builder.cc +++ b/hybridse/src/codegen/ir_base_builder.cc @@ -556,7 +556,24 @@ bool GetFullType(node::NodeManager* nm, ::llvm::Type* type, return false; } case hybridse::node::kMap: { - LOG(WARNING) << "fail to get type for map"; + if (type->isPointerTy()) { + auto type_pointee = type->getPointerElementType(); + if (type_pointee->isStructTy()) { + auto* key_type = type_pointee->getStructElementType(1); + const node::TypeNode* key = nullptr; + if (key_type->isPointerTy() && !GetFullType(nm, key_type->getPointerElementType(), &key)) { + return false; + } + const node::TypeNode* value = nullptr; + auto* value_type = type_pointee->getStructElementType(2); + if (value_type->isPointerTy() && !GetFullType(nm, value_type->getPointerElementType(), &value)) { + return false; + } + + *type_node = nm->MakeNode(key, value); + return true; + } + } return false; } default: { @@ -643,6 +660,9 @@ bool GetBaseType(::llvm::Type* type, ::hybridse::node::DataType* output) { } else if (struct_name.startswith("fe.array_")) { *output = hybridse::node::kArray; return true; + } else if (struct_name.startswith("fe.map_")) { + *output = hybridse::node::kMap; + return true; } LOG(WARNING) << "no mapping pointee_ty for llvm pointee_ty " << pointee_ty->getStructName().str(); diff --git a/hybridse/src/codegen/ir_base_builder_test.h b/hybridse/src/codegen/ir_base_builder_test.h index 478d8ae5ea3..af29e4fd56c 100644 --- a/hybridse/src/codegen/ir_base_builder_test.h +++ b/hybridse/src/codegen/ir_base_builder_test.h @@ -22,8 +22,8 @@ #include #include +#include "codegen/ir_base_builder.h" #include "llvm/IR/Verifier.h" -#include "llvm/Support/InitLLVM.h" #include "llvm/Support/TargetSelect.h" #include "base/fe_status.h" @@ -34,8 +34,7 @@ #include "passes/resolve_fn_and_attrs.h" #include "udf/default_udf_library.h" #include "udf/literal_traits.h" -#include "udf/udf.h" -#include "vm/sql_compiler.h" +#include "vm/jit_wrapper.h" namespace hybridse { namespace codegen { @@ -360,8 +359,7 @@ void ModuleFunctionBuilderWithFullInfo::ExpandApplyArg( ::llvm::Value* alloca; if (TypeIRBuilder::IsStructPtr(expect_ty)) { auto struct_builder = - StructTypeIRBuilder::CreateStructTypeIRBuilder( - function->getEntryBlock().getModule(), expect_ty); + StructTypeIRBuilder::CreateStructTypeIRBuilder(function->getEntryBlock().getModule(), expect_ty); struct_builder->CreateDefault(&function->getEntryBlock(), &alloca); arg = builder.CreateSelect( diff --git a/hybridse/src/codegen/map_ir_builder.cc b/hybridse/src/codegen/map_ir_builder.cc new file mode 100644 index 00000000000..8945c88f9b7 --- /dev/null +++ b/hybridse/src/codegen/map_ir_builder.cc @@ -0,0 +1,326 @@ +/* + * Copyright 2022 OpenMLDB authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "codegen/map_ir_builder.h" + +#include + +#include "absl/status/status.h" +#include "codegen/array_ir_builder.h" +#include "codegen/cast_expr_ir_builder.h" +#include "codegen/context.h" +#include "codegen/ir_base_builder.h" +#include "codegen/cond_select_ir_builder.h" +#include "codegen/predicate_expr_ir_builder.h" + +namespace hybridse { +namespace codegen { + +static const char* PREFIX = "fe.map"; +#define SZ_IDX 0 +#define KEY_VEC_IDX 1 +#define VALUE_VEC_IDX 2 +#define VALUE_NULL_VEC_IDX 3 + +MapIRBuilder::MapIRBuilder(::llvm::Module* m, ::llvm::Type* key_ty, ::llvm::Type* value_ty) + : StructTypeIRBuilder(m), key_type_(key_ty), value_type_(value_ty) { + InitStructType(); +} + +void MapIRBuilder::InitStructType() { + std::string name = + absl::StrCat(PREFIX, "__", GetLlvmObjectString(key_type_), "_", GetLlvmObjectString(value_type_), "__"); + ::llvm::StringRef sr(name); + ::llvm::StructType* stype = m_->getTypeByName(sr); + if (stype != NULL) { + struct_type_ = stype; + return; + } + stype = ::llvm::StructType::create(m_->getContext(), name); + + // %map__{key}_{value}__ = { size, vec, vec, vec } + ::llvm::Type* size_type = ::llvm::IntegerType::getInt64Ty(m_->getContext()); + // ::llvm::Type* key_vec = ::llvm::VectorType::get(key_type_, {MIN_VEC_SIZE, true}); + LOG(INFO) << "key vec is " << GetLlvmObjectString(key_type_); + ::llvm::Type* key_vec = key_type_->getPointerTo(); + ::llvm::Type* value_vec = value_type_->getPointerTo(); + ::llvm::Type* value_null_type = ::llvm::IntegerType::getInt1Ty(m_->getContext())->getPointerTo(); + stype->setBody({size_type, key_vec, value_vec, value_null_type}); + struct_type_ = stype; +} + +absl::StatusOr MapIRBuilder::Construct(CodeGenContext* ctx, absl::Span args) const { + EnsureOK(); + + ::llvm::Value* map_alloca = nullptr; + if (!Allocate(ctx->GetCurrentBlock(), &map_alloca)) { + return absl::FailedPreconditionError(absl::StrCat("unable to allocate ", GetLlvmObjectString(struct_type_))); + } + + auto builder = ctx->GetBuilder(); + auto* original_size = builder->getInt64(args.size() / 2); + auto* key_vec = builder->CreateAlloca(key_type_, original_size, "key_vec"); + auto* value_vec = builder->CreateAlloca(value_type_, original_size, "value_vec"); + auto* value_nulls_vec = builder->CreateAlloca(builder->getInt1Ty(), original_size, "value_nulls_vec"); + + // creating raw values for map + + CastExprIRBuilder cast_builder(ctx->GetCurrentBlock()); + + // original vector, may contains duplicate keys + auto* original_keys = builder->CreateAlloca(key_type_, original_size, "original_keys"); + auto* original_keys_is_null = builder->CreateAlloca(builder->getInt1Ty(), original_size, "original_keys_is_null"); + auto* original_values = builder->CreateAlloca(value_type_, original_size, "original_values"); + auto* original_values_is_null = + builder->CreateAlloca(builder->getInt1Ty(), original_size, "original_values_is_null"); + for (size_t i = 0; i < args.size(); i += 2) { + auto* update_idx = builder->getInt64(i / 2); + NativeValue key = args[i]; + if (key.GetValue(builder)->getType() != key_type_) { + auto s = cast_builder.Cast(key, key_type_, &key); + if (!s.isOK()) { + return absl::InternalError(absl::StrCat("fail to case map key: ", s.str())); + } + } + NativeValue value = args[i + 1]; + if (value.GetValue(builder)->getType() != value_type_) { + auto s = cast_builder.Cast(value, value_type_, &value); + if (!s.isOK()) { + return 
absl::InternalError(absl::StrCat("fail to case map value: ", s.str())); + } + } + builder->CreateStore(key.GetIsNull(ctx), builder->CreateGEP(original_keys_is_null, update_idx)); + builder->CreateStore(key.GetValue(ctx), builder->CreateGEP(original_keys, update_idx)); + builder->CreateStore(value.GetIsNull(ctx), builder->CreateGEP(original_values_is_null, update_idx)); + builder->CreateStore(value.GetValue(ctx), builder->CreateGEP(original_values, update_idx)); + } + + ::llvm::Value* update_idx_ptr = builder->CreateAlloca(builder->getInt64Ty(), nullptr, "update_idx"); + builder->CreateStore(builder->getInt64(0), update_idx_ptr); + ::llvm::Value* true_idx_ptr = builder->CreateAlloca(builder->getInt64Ty(), nullptr, "true_idx"); + builder->CreateStore(builder->getInt64(0), true_idx_ptr); + + auto s = ctx->CreateWhile( + [&](llvm::Value** cond) -> base::Status { + *cond = builder->CreateAnd( + builder->CreateICmpSLT(builder->CreateLoad(update_idx_ptr), original_size, "if_while_true"), + builder->CreateICmpSLT(builder->CreateLoad(true_idx_ptr), original_size)); + return {}; + }, + [&]() -> base::Status { + auto idx = builder->CreateLoad(update_idx_ptr, "update_idx_value"); + auto true_idx = builder->CreateLoad(true_idx_ptr, "true_idx_value"); + CHECK_STATUS(ctx->CreateBranchNot( + builder->CreateLoad(builder->CreateGEP(original_keys_is_null, idx)), [&]() -> base::Status { + // write to map if key is not null + builder->CreateStore(builder->CreateLoad(builder->CreateGEP(original_keys, idx)), + builder->CreateGEP(key_vec, true_idx)); + builder->CreateStore(builder->CreateLoad(builder->CreateGEP(original_values, idx)), + builder->CreateGEP(value_vec, true_idx)); + builder->CreateStore(builder->CreateLoad(builder->CreateGEP(original_values_is_null, idx)), + builder->CreateGEP(value_nulls_vec, true_idx)); + + builder->CreateStore(builder->CreateAdd(builder->getInt64(1), true_idx), true_idx_ptr); + return {}; + })); + + builder->CreateStore(builder->CreateAdd(builder->getInt64(1), idx), update_idx_ptr); + return {}; + }); + if (!s.isOK()) { + return absl::InternalError(s.str()); + } + + auto* final_size = builder->CreateLoad(true_idx_ptr, "true_size"); + auto as = Set(ctx, map_alloca, {final_size, key_vec, value_vec, value_nulls_vec}); + + if (!as.ok()) { + return as; + } + + return NativeValue::Create(map_alloca); +} + +bool MapIRBuilder::CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) { + llvm::Value* map_alloca = nullptr; + if (!Allocate(block, &map_alloca)) { + return false; + } + + llvm::IRBuilder<> builder(block); + ::llvm::Value* size = builder.getInt64(0); + if (!Set(block, map_alloca, SZ_IDX, size)) { + return false; + } + + *output = map_alloca; + return true; +} + +absl::StatusOr MapIRBuilder::ExtractElement(CodeGenContext* ctx, const NativeValue& arr, + const NativeValue& key) const { + EnsureOK(); + + auto builder = ctx->GetBuilder(); + auto* arr_is_null = arr.GetIsNull(ctx); + auto* key_is_null = key.GetIsNull(ctx); + + auto* out_val_alloca = builder->CreateAlloca(value_type_); + builder->CreateStore(::llvm::UndefValue::get(value_type_), out_val_alloca); + auto* out_null_alloca = builder->CreateAlloca(builder->getInt1Ty()); + builder->CreateStore(builder->getInt1(true), out_null_alloca); + + auto s = ctx->CreateBranch( + builder->CreateOr(arr_is_null, key_is_null), + [&]() -> base::Status { + return {}; + }, + [&]() -> base::Status { + NativeValue casted_key = key; + if (key.GetType() != key_type_) { + CastExprIRBuilder cast_builder(ctx->GetCurrentBlock()); + 
CHECK_STATUS(cast_builder.Cast(key, key_type_, &casted_key)); + } + auto* key_val = casted_key.GetValue(ctx); + + auto* map_ptr = arr.GetValue(ctx); + ::llvm::Value* sz = nullptr; + CHECK_TRUE(Load(ctx->GetCurrentBlock(), map_ptr, SZ_IDX, &sz), common::kCodegenError); + + ::llvm::Value* keys = nullptr; + CHECK_TRUE(Load(ctx->GetCurrentBlock(), map_ptr, KEY_VEC_IDX, &keys), common::kCodegenError); + + ::llvm::Value* idx_alloc = builder->CreateAlloca(builder->getInt64Ty()); + builder->CreateStore(builder->getInt64(0), idx_alloc); + ::llvm::Value* found_idx_alloc = builder->CreateAlloca(builder->getInt64Ty()); + builder->CreateStore(builder->getInt64(-1), found_idx_alloc); + + CHECK_STATUS(ctx->CreateWhile( + [&](::llvm::Value** cond) -> base::Status { + ::llvm::Value* idx = builder->CreateLoad(idx_alloc); + ::llvm::Value* found = builder->CreateLoad(found_idx_alloc); + *cond = builder->CreateAnd(builder->CreateICmpSLT(idx, sz), + builder->CreateICmpSLT(found, builder->getInt64(0))); + return {}; + }, + [&]() -> base::Status { + ::llvm::Value* idx = builder->CreateLoad(idx_alloc); + // key never null + auto* ele = builder->CreateLoad(builder->CreateGEP(keys, idx)); + ::llvm::Value* eq = nullptr; + base::Status s; + PredicateIRBuilder::BuildEqExpr(ctx->GetCurrentBlock(), ele, key_val, &eq, s); + CHECK_STATUS(s); + + ::llvm::Value* update_found_idx = builder->CreateSelect(eq, idx, builder->getInt64(-1)); + + builder->CreateStore(update_found_idx, found_idx_alloc); + builder->CreateStore(builder->CreateAdd(idx, builder->getInt64(1)), idx_alloc); + return {}; + })); + + auto* found_idx = builder->CreateLoad(found_idx_alloc); + + CHECK_STATUS(ctx->CreateBranch( + builder->CreateAnd(builder->CreateICmpSLT(found_idx, sz), + builder->CreateICmpSGE(found_idx, builder->getInt64(0))), + [&]() -> base::Status { + ::llvm::Value* values = nullptr; + CHECK_TRUE(Load(ctx->GetCurrentBlock(), map_ptr, VALUE_VEC_IDX, &values), common::kCodegenError); + + ::llvm::Value* value_nulls = nullptr; + CHECK_TRUE(Load(ctx->GetCurrentBlock(), map_ptr, VALUE_NULL_VEC_IDX, &value_nulls), + common::kCodegenError); + + auto* val = builder->CreateLoad(builder->CreateGEP(values, found_idx)); + auto* val_nullable = builder->CreateLoad(builder->CreateGEP(value_nulls, found_idx)); + + builder->CreateStore(val, out_val_alloca); + builder->CreateStore(val_nullable, out_null_alloca); + return {}; + }, + [&]() -> base::Status { return {}; })); + + return {}; + }); + + if (!s.isOK()) { + return absl::InvalidArgumentError(s.str()); + } + + auto* out_val = builder->CreateLoad(out_val_alloca); + auto* out_null_val = builder->CreateLoad(out_null_alloca); + + return NativeValue::CreateWithFlag(out_val, out_null_val); +} + +absl::StatusOr MapIRBuilder::MapKeys(CodeGenContext* ctx, const NativeValue& in) const { + EnsureOK(); + + auto map_is_null = in.GetIsNull(ctx); + auto map_ptr = in.GetValue(ctx); + + auto builder = ctx->GetBuilder(); + ::llvm::Value* keys_ptr = nullptr; + if (!Load(ctx->GetCurrentBlock(), map_ptr, KEY_VEC_IDX, &keys_ptr)) { + return absl::FailedPreconditionError("failed to extract map keys"); + } + if (!keys_ptr->getType()->isPointerTy()) { + return absl::FailedPreconditionError("map keys entry is not pointer"); + } + ::llvm::Value* size = nullptr; + if (!Load(ctx->GetCurrentBlock(), map_ptr, SZ_IDX, &size)) { + return absl::FailedPreconditionError("failed to extract map size"); + } + + // construct nulls as [false ...] 
+ auto nulls = builder->CreateAlloca(builder->getInt1Ty(), size); + auto idx_ptr = builder->CreateAlloca(builder->getInt64Ty()); + builder->CreateStore(builder->getInt64(0), idx_ptr); + ctx->CreateWhile( + [&](::llvm::Value** cond) -> base::Status { + *cond = builder->CreateICmpSLT(builder->CreateLoad(idx_ptr), size); + return {}; + }, + [&]() -> base::Status { + auto idx = builder->CreateLoad(idx_ptr); + + builder->CreateStore(builder->getInt1(false), builder->CreateGEP(nulls, idx)); + + builder->CreateStore(builder->CreateAdd(idx, builder->getInt64(1)), idx_ptr); + return {}; + }); + + ArrayIRBuilder array_builder(ctx->GetModule(), keys_ptr->getType()->getPointerElementType()); + auto rs = array_builder.ConstructFromRaw(ctx, {keys_ptr, nulls, size}); + + if (!rs.ok()) { + return rs.status(); + } + + NativeValue out; + CondSelectIRBuilder cond_builder; + auto s = cond_builder.Select(ctx->GetCurrentBlock(), NativeValue::Create(map_is_null), + NativeValue::CreateNull(array_builder.GetType()), NativeValue::Create(rs.value()), &out); + + if (!s.isOK()) { + return absl::FailedPreconditionError(s.str()); + } + + return out; +} +} // namespace codegen +} // namespace hybridse diff --git a/hybridse/src/codegen/map_ir_builder.h b/hybridse/src/codegen/map_ir_builder.h new file mode 100644 index 00000000000..478c6cc975b --- /dev/null +++ b/hybridse/src/codegen/map_ir_builder.h @@ -0,0 +1,55 @@ +/* + * Copyright 2022 OpenMLDB authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef HYBRIDSE_SRC_CODEGEN_MAP_IR_BUILDER_H_ +#define HYBRIDSE_SRC_CODEGEN_MAP_IR_BUILDER_H_ + +#include "codegen/struct_ir_builder.h" + +namespace hybridse { +namespace codegen { + +class MapIRBuilder final : public StructTypeIRBuilder { + public: + MapIRBuilder(::llvm::Module* m, ::llvm::Type* key_ty, ::llvm::Type* value_ty); + ~MapIRBuilder() override {} + + absl::StatusOr Construct(CodeGenContext* ctx, absl::Span args) const override; + + bool CopyFrom(::llvm::BasicBlock* block, ::llvm::Value* src, ::llvm::Value* dist) override { return true; } + base::Status CastFrom(::llvm::BasicBlock* block, const NativeValue& src, NativeValue* output) override { + return {}; + } + + absl::StatusOr ExtractElement(CodeGenContext* ctx, const NativeValue&, + const NativeValue&) const override; + + absl::StatusOr MapKeys(CodeGenContext*, const NativeValue&) const; + + private: + void InitStructType() override; + + bool CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) override; + + private: + ::llvm::Type* key_type_ = nullptr; + ::llvm::Type* value_type_ = nullptr; +}; + +} // namespace codegen +} // namespace hybridse + +#endif // HYBRIDSE_SRC_CODEGEN_MAP_IR_BUILDER_H_ diff --git a/hybridse/src/codegen/string_ir_builder.cc b/hybridse/src/codegen/string_ir_builder.cc index 8c41d326ee0..083c907fbe4 100644 --- a/hybridse/src/codegen/string_ir_builder.cc +++ b/hybridse/src/codegen/string_ir_builder.cc @@ -66,7 +66,7 @@ bool StringIRBuilder::CreateDefault(::llvm::BasicBlock* block, bool StringIRBuilder::NewString(::llvm::BasicBlock* block, ::llvm::Value** output) { - if (!Create(block, output)) { + if (!Allocate(block, output)) { LOG(WARNING) << "Fail to Create Default String"; return false; } @@ -86,7 +86,7 @@ bool StringIRBuilder::NewString(::llvm::BasicBlock* block, } bool StringIRBuilder::NewString(::llvm::BasicBlock* block, ::llvm::Value* size, ::llvm::Value* data, ::llvm::Value** output) { - if (!Create(block, output)) { + if (!Allocate(block, output)) { LOG(WARNING) << "Fail to Create Default String"; return false; } diff --git a/hybridse/src/codegen/struct_ir_builder.cc b/hybridse/src/codegen/struct_ir_builder.cc index 7adfb5d950f..4b0be401065 100644 --- a/hybridse/src/codegen/struct_ir_builder.cc +++ b/hybridse/src/codegen/struct_ir_builder.cc @@ -15,10 +15,15 @@ */ #include "codegen/struct_ir_builder.h" + +#include "absl/status/status.h" +#include "absl/strings/substitute.h" +#include "codegen/context.h" #include "codegen/date_ir_builder.h" #include "codegen/ir_base_builder.h" #include "codegen/string_ir_builder.h" #include "codegen/timestamp_ir_builder.h" + namespace hybridse { namespace codegen { StructTypeIRBuilder::StructTypeIRBuilder(::llvm::Module* m) @@ -54,6 +59,8 @@ StructTypeIRBuilder* StructTypeIRBuilder::CreateStructTypeIRBuilder(::llvm::Modu } absl::StatusOr StructTypeIRBuilder::CreateNull(::llvm::BasicBlock* block) { + EnsureOK(); + ::llvm::Value* value = nullptr; if (!CreateDefault(block, &value)) { return absl::InternalError(absl::StrCat("fail to construct ", GetLlvmObjectString(GetType()))); @@ -62,16 +69,17 @@ absl::StatusOr StructTypeIRBuilder::CreateNull(::llvm::BasicBlock* return NativeValue::CreateWithFlag(value, builder.getInt1(true)); } -::llvm::Type* StructTypeIRBuilder::GetType() { return struct_type_; } +::llvm::Type* StructTypeIRBuilder::GetType() const { return struct_type_; } -bool StructTypeIRBuilder::Create(::llvm::BasicBlock* block, +bool StructTypeIRBuilder::Allocate(::llvm::BasicBlock* block, ::llvm::Value** output) const { if (block == 
NULL || output == NULL) {
         LOG(WARNING) << "the output ptr or block is NULL ";
         return false;
     }
     ::llvm::IRBuilder<> builder(block);
-    ::llvm::Value* value = CreateAllocaAtHead(&builder, struct_type_, "struct_alloca");
+    // value is a pointer to struct type
+    ::llvm::Value* value = CreateAllocaAtHead(&builder, struct_type_, GetLlvmObjectString(struct_type_));
     *output = value;
     return true;
 }
@@ -96,22 +104,10 @@ bool StructTypeIRBuilder::Set(::llvm::BasicBlock* block, ::llvm::Value* struct_v
         LOG(WARNING) << "Fail set Struct value: struct pointer is required";
         return false;
     }
-    if (struct_value->getType()->getPointerElementType() != struct_type_) {
-        LOG(WARNING) << "Fail set Struct value: struct value type invalid "
-                     << struct_value->getType()
-                            ->getPointerElementType()
-                            ->getStructName()
-                            .str();
-        return false;
-    }
+
     ::llvm::IRBuilder<> builder(block);
-    builder.getInt64(1);
-    ::llvm::Value* value_ptr =
-        builder.CreateStructGEP(struct_type_, struct_value, idx);
-    if (nullptr == builder.CreateStore(value, value_ptr)) {
-        LOG(WARNING) << "Fail Set Struct Value idx = " << idx;
-        return false;
-    }
+    ::llvm::Value* value_ptr = builder.CreateStructGEP(struct_type_, struct_value, idx);
+    builder.CreateStore(value, value_ptr);
     return true;
 }
 
@@ -137,5 +133,77 @@ bool StructTypeIRBuilder::Get(::llvm::BasicBlock* block, ::llvm::Value* struct_v
     *output = builder.CreateStructGEP(struct_type_, struct_value, idx);
     return true;
 }
+absl::StatusOr<NativeValue> StructTypeIRBuilder::Construct(CodeGenContext* ctx,
+                                                           absl::Span<NativeValue const> args) const {
+    return absl::UnimplementedError(absl::StrCat("Construct for type ", GetLlvmObjectString(struct_type_)));
+}
+
+absl::StatusOr<::llvm::Value*> StructTypeIRBuilder::ConstructFromRaw(CodeGenContext* ctx,
+                                                                     absl::Span<::llvm::Value* const> args) const {
+    EnsureOK();
+
+    llvm::Value* alloca = nullptr;
+    if (!Allocate(ctx->GetCurrentBlock(), &alloca)) {
+        return absl::FailedPreconditionError("failed to allocate array");
+    }
+
+    auto s = Set(ctx, alloca, args);
+    if (!s.ok()) {
+        return s;
+    }
+
+    return alloca;
+}
+
+absl::StatusOr<NativeValue> StructTypeIRBuilder::ExtractElement(CodeGenContext* ctx, const NativeValue& arr,
+                                                                const NativeValue& key) const {
+    return absl::UnimplementedError(
+        absl::StrCat("extract element unimplemented for ", GetLlvmObjectString(struct_type_)));
+}
+
+void StructTypeIRBuilder::EnsureOK() const {
+    assert(struct_type_ != nullptr);
+    // it's an identified type
+    assert(!struct_type_->getName().empty());
+}
+std::string StructTypeIRBuilder::GetTypeDebugString() const { return GetLlvmObjectString(struct_type_); }
+
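// Illustrative sketch, not part of the patch: how a concrete builder is expected to use
// ConstructFromRaw(). The members span must match the struct body one-to-one, which the
// absl::Status Set() overload below enforces (member count and element types). The variables
// ctx, keys_ptr, nulls_ptr and size are assumed to be in scope, mirroring MapIRBuilder::MapKeys()
// earlier in this diff.
ArrayIRBuilder array_builder(ctx->GetModule(), keys_ptr->getType()->getPointerElementType());
// layout of the fe.array_ struct: {T* raw values, bool* null flags, int64 size}
absl::StatusOr<::llvm::Value*> arr = array_builder.ConstructFromRaw(ctx, {keys_ptr, nulls_ptr, size});
if (!arr.ok()) {
    // callers in this diff surface the absl::Status as a codegen error
}
// (end of illustrative sketch)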
+absl::Status StructTypeIRBuilder::Set(CodeGenContext* ctx, ::llvm::Value* struct_value,
+                                      absl::Span<::llvm::Value* const> members) const {
+    if (ctx == nullptr || struct_value == nullptr) {
+        return absl::InvalidArgumentError("ctx or struct pointer is null");
+    }
+
+    if (!IsStructPtr(struct_value->getType())) {
+        return absl::InvalidArgumentError(
+            absl::StrCat("value not a struct pointer: ", GetLlvmObjectString(struct_value->getType())));
+    }
+
+    if (struct_value->getType()->getPointerElementType() != struct_type_) {
+        return absl::InvalidArgumentError(absl::Substitute("input value has different type, expect $0 but got $1",
+                                                           GetLlvmObjectString(struct_type_),
+                                                           GetLlvmObjectString(struct_value->getType())));
+    }
+
+    if (members.size() != struct_type_->getNumElements()) {
+        return absl::InvalidArgumentError(absl::Substitute("struct $0 requires exactly $1 members, but got $2",
+                                                           GetLlvmObjectString(struct_type_),
+                                                           struct_type_->getNumElements(), members.size()));
+    }
+
+    for (unsigned idx = 0; idx < struct_type_->getNumElements(); ++idx) {
+        auto ele_type = struct_type_->getElementType(idx);
+        if (ele_type != members[idx]->getType()) {
+            return absl::InvalidArgumentError(absl::Substitute("$0th member: expect $1 but got $2", idx,
+                                                               GetLlvmObjectString(ele_type),
+                                                               GetLlvmObjectString(members[idx]->getType())));
+        }
+        ::llvm::Value* value_ptr = ctx->GetBuilder()->CreateStructGEP(struct_type_, struct_value, idx);
+        ctx->GetBuilder()->CreateStore(members[idx], value_ptr);
+    }
+
+    return absl::OkStatus();
+}
+
 }  // namespace codegen
 }  // namespace hybridse
diff --git a/hybridse/src/codegen/struct_ir_builder.h b/hybridse/src/codegen/struct_ir_builder.h
index e197665855b..f9b6ca30731 100644
--- a/hybridse/src/codegen/struct_ir_builder.h
+++ b/hybridse/src/codegen/struct_ir_builder.h
@@ -17,6 +17,8 @@
 #ifndef HYBRIDSE_SRC_CODEGEN_STRUCT_IR_BUILDER_H_
 #define HYBRIDSE_SRC_CODEGEN_STRUCT_IR_BUILDER_H_
 
+#include
+
 #include "absl/status/statusor.h"
 #include "base/fe_status.h"
 #include "codegen/native_value.h"
@@ -27,20 +29,46 @@ namespace codegen {
 
 class StructTypeIRBuilder : public TypeIRBuilder {
  public:
+    // TODO(ace): construct with CodeGenContext instead of llvm::Module
     explicit StructTypeIRBuilder(::llvm::Module*);
     ~StructTypeIRBuilder();
 
     static StructTypeIRBuilder* CreateStructTypeIRBuilder(::llvm::Module*, ::llvm::Type*);
     static bool StructCopyFrom(::llvm::BasicBlock* block, ::llvm::Value* src, ::llvm::Value* dist);
 
-    virtual void InitStructType() = 0;
     virtual bool CopyFrom(::llvm::BasicBlock* block, ::llvm::Value* src, ::llvm::Value* dist) = 0;
     virtual base::Status CastFrom(::llvm::BasicBlock* block, const NativeValue& src, NativeValue* output) = 0;
-    virtual bool CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) = 0;
 
+    // construct the default null safe struct
     absl::StatusOr<NativeValue> CreateNull(::llvm::BasicBlock* block);
-    ::llvm::Type* GetType();
-    bool Create(::llvm::BasicBlock* block, ::llvm::Value** output) const;
+
+    virtual bool CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) = 0;
+
+    // Allocate and initialize the struct value from args; each element in the list represents the
+    // corresponding argument in the SQL literal. So for the map data type, created in SQL with
+    // `map(key1, value1, ...)`, each arg is a key or a value of the result map.
+    virtual absl::StatusOr<NativeValue> Construct(CodeGenContext* ctx, absl::Span<NativeValue const> args) const;
+
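// Illustrative sketch, not part of the patch: what Construct() receives for a SQL map literal
// such as `map(1, "a", 2, "b")`. The args span holds the evaluated expressions in literal order
// (key1, value1, key2, value2); ctx, key_ty, value_ty and args are assumed to be the codegen
// context, LLVM types and evaluated NativeValues available at the call site.
MapIRBuilder map_builder(ctx->GetModule(), key_ty, value_ty);
absl::StatusOr<NativeValue> map_value = map_builder.Construct(ctx, args);
// element access such as `map(1, "a", 2, "b")[2]` is routed to ExtractElement() instead,
// see ExprIRBuilder::BuildArrayElement earlier in this diff
// (end of illustrative sketch)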
+    // construct struct value from llvm values; each element in the list maps to the
+    // llvm struct field at that index
+    virtual absl::StatusOr<::llvm::Value*> ConstructFromRaw(CodeGenContext* ctx,
+                                                            absl::Span<::llvm::Value* const> args) const;
+
+    // Extract element value from composite data type
+    // 1. extract from array type by index
+    // 2. extract from struct type by field name
+    // 3. extract from map type by key
+    virtual absl::StatusOr<NativeValue> ExtractElement(CodeGenContext* ctx, const NativeValue& arr,
+                                                       const NativeValue& key) const;
+
+    ::llvm::Type* GetType() const;
+
+    std::string GetTypeDebugString() const;
+
+ protected:
+    virtual void InitStructType() = 0;
+
+    // allocate the given struct on current stack, no initialization
+    bool Allocate(::llvm::BasicBlock* block, ::llvm::Value** output) const;
 
     // Load the 'idx' th field into ''*output'
     // NOTE: not all types are loaded correctly, e.g for array type
@@ -50,9 +78,13 @@ class StructTypeIRBuilder : public TypeIRBuilder {
     // Get the address of 'idx' th field
     bool Get(::llvm::BasicBlock* block, ::llvm::Value* struct_value, unsigned int idx, ::llvm::Value** output) const;
 
+    absl::Status Set(CodeGenContext* ctx, ::llvm::Value* struct_value, absl::Span<::llvm::Value* const> members) const;
+
+    void EnsureOK() const;
+
  protected:
     ::llvm::Module* m_;
-    ::llvm::Type* struct_type_;
+    ::llvm::StructType* struct_type_;
 };
 }  // namespace codegen
 }  // namespace hybridse
diff --git a/hybridse/src/codegen/timestamp_ir_builder.cc b/hybridse/src/codegen/timestamp_ir_builder.cc
index c3a8054e1cd..a07c29ee3de 100644
--- a/hybridse/src/codegen/timestamp_ir_builder.cc
+++ b/hybridse/src/codegen/timestamp_ir_builder.cc
@@ -267,7 +267,7 @@ bool TimestampIRBuilder::NewTimestamp(::llvm::BasicBlock* block,
         return false;
     }
     ::llvm::Value* timestamp;
-    if (!Create(block, &timestamp)) {
+    if (!Allocate(block, &timestamp)) {
         return false;
     }
     if (!SetTs(block, timestamp,
@@ -286,7 +286,7 @@ bool TimestampIRBuilder::NewTimestamp(::llvm::BasicBlock* block,
         return false;
     }
     ::llvm::Value* timestamp;
-    if (!Create(block, &timestamp)) {
+    if (!Allocate(block, &timestamp)) {
         return false;
     }
     if (!SetTs(block, timestamp, ts)) {
diff --git a/hybridse/src/codegen/type_ir_builder.cc b/hybridse/src/codegen/type_ir_builder.cc
index 07adfb21855..0cba6015b9d 100644
--- a/hybridse/src/codegen/type_ir_builder.cc
+++ b/hybridse/src/codegen/type_ir_builder.cc
@@ -103,11 +103,7 @@ bool TypeIRBuilder::IsStringPtr(::llvm::Type* type) {
 }
 
 bool TypeIRBuilder::IsStructPtr(::llvm::Type* type) {
-    if (type->getTypeID() == ::llvm::Type::PointerTyID) {
-        type = reinterpret_cast<::llvm::PointerType*>(type)->getElementType();
-        return type->isStructTy();
-    }
-    return false;
+    return type->isPointerTy() && type->getPointerElementType()->isStructTy();
 }
 
 base::Status TypeIRBuilder::UnaryOpTypeInfer(
diff --git a/hybridse/src/codegen/udf_ir_builder.cc b/hybridse/src/codegen/udf_ir_builder.cc
index 5030f3cd8ae..c9f613e5748 100644
--- a/hybridse/src/codegen/udf_ir_builder.cc
+++ b/hybridse/src/codegen/udf_ir_builder.cc
@@ -16,6 +16,8 @@
 
 #include "codegen/udf_ir_builder.h"
 
+#include
+#include
 #include
 
 #include "codegen/context.h"
@@ -172,7 +174,7 @@ Status UdfIRBuilder::BuildCodeGenUdfCall(
     }
 
     NativeValue gen_output;
-    CHECK_STATUS(gen_impl->gen(ctx_, args, &gen_output));
+    CHECK_STATUS(gen_impl->gen(ctx_, args, {fn->GetReturnType(), fn->IsReturnNullable()}, &gen_output));
 
     if (ret_null != nullptr) {
         if (gen_output.IsNullable()) {
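// Illustrative sketch, not part of the patch: what the simplified TypeIRBuilder::IsStructPtr()
// above now accepts. It answers "is this a pointer whose pointee is an LLVM struct?", which is
// true for the fe.array_*/fe.map_* struct pointers built in this diff and false for scalars.
// Plain LLVM API only; the struct name here is arbitrary.
llvm::LLVMContext llvm_ctx;
auto* fe_map = llvm::StructType::create(llvm_ctx, "fe.map_example");
bool is_struct_ptr = hybridse::codegen::TypeIRBuilder::IsStructPtr(fe_map->getPointerTo());           // true
bool is_scalar = hybridse::codegen::TypeIRBuilder::IsStructPtr(llvm::Type::getInt64Ty(llvm_ctx));     // false
// (end of illustrative sketch)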
"passes/expression/expr_pass.h" @@ -210,18 +208,26 @@ Status ExprNode::IsCastAccept(node::NodeManager* nm, const TypeNode* src, // this handles compatible type when both lhs and rhs are basic types // composited types like array, list, tuple are not handled correctly, so do not expect the function to handle those -// types absl::StatusOr ExprNode::CompatibleType(NodeManager* nm, const TypeNode* lhs, const TypeNode* rhs) { if (*lhs == *rhs) { // include Null = Null return rhs; } + + if (lhs->base() == kVoid && rhs->base() == kNull) { + return lhs; + } + + if (lhs->base() == kNull && rhs->base() == kVoid) { + return rhs; + } + if (lhs->IsNull()) { - // NULL + T -> T + // NULL/VOID + T -> T return rhs; } if (rhs->IsNull()) { - // T + NULL -> T + // T + NULL/VOID -> T return lhs; } @@ -845,21 +851,15 @@ Status ArrayExpr::InferAttr(ExprAnalysisContext* ctx) { return Status::OK(); } - // auto top_type = ctx->node_manager()->MakeTypeNode(kArray); TypeNode* top_type = nullptr; auto nm = ctx->node_manager(); - if (children_.empty()) { - FAIL_STATUS(kTypeError, "element type unknown for empty array expression"); - } else { - const TypeNode* ele_type = children_[0]->GetOutputType(); - for (size_t i = 1; i < children_.size() ; ++i) { - auto res = CompatibleType(ctx->node_manager(), ele_type, children_[i]->GetOutputType()); - CHECK_TRUE(res.ok(), kTypeError, res.status()); - ele_type = res.value(); - } - CHECK_TRUE(!ele_type->IsNull(), kTypeError, "unable to infer array type, all elements are null"); - top_type = nm->MakeArrayType(ele_type, children_.size()); + const TypeNode* ele_type = nm->MakeNode(); // void type + for (size_t i = 0; i < children_.size(); ++i) { + auto res = CompatibleType(ctx->node_manager(), ele_type, children_[i]->GetOutputType()); + CHECK_TRUE(res.ok(), kTypeError, res.status()); + ele_type = res.value(); } + top_type = nm->MakeArrayType(ele_type, children_.size()); SetOutputType(top_type); // array is nullable SetNullable(true); @@ -1142,5 +1142,50 @@ ExprNode* ExprNode::DeepCopy(NodeManager* nm) const { return root; } +ArrayElementExpr::ArrayElementExpr(ExprNode* array, ExprNode* pos) : ExprNode(kExprArrayElement) { + AddChild(array); + AddChild(pos); +} + +void ArrayElementExpr::Print(std::ostream& output, const std::string& org_tab) const { + // Print for ExprNode just talk too much, I don't intend impl that + // GetExprString is much simpler + output << org_tab << GetExprString(); +} + +const std::string ArrayElementExpr::GetExprString() const { + return absl::StrCat(array()->GetExprString(), "[", position()->GetExprString(), "]"); +} + +ArrayElementExpr* ArrayElementExpr::ShadowCopy(NodeManager* nm) const { + return nm->MakeNode(array(), position()); +} + +Status ArrayElementExpr::InferAttr(ExprAnalysisContext* ctx) { + auto* arr_type = array()->GetOutputType(); + auto* pos_type = position()->GetOutputType(); + + if (arr_type->IsMap()) { + auto map_type = arr_type->GetAsOrNull(); + CHECK_TRUE(node::ExprNode::IsSafeCast(pos_type, map_type->key_type()), common::kTypeError, + "incompatiable key type for ArrayElement, expect ", map_type->key_type()->DebugString(), ", got ", + pos_type->DebugString()); + + SetOutputType(map_type->value_type()); + SetNullable(map_type->value_nullable()); + } else if (arr_type->IsArray()) { + CHECK_TRUE(pos_type->IsInteger(), common::kTypeError, + "index type mismatch for ArrayElement, expect integer, got ", pos_type->DebugString()); + CHECK_TRUE(arr_type->GetGenericSize() == 1, common::kTypeError, "internal error: array of empty T"); + + 
SetOutputType(arr_type->GetGenericType(0)); + SetNullable(arr_type->IsGenericNullable(0)); + } else { + FAIL_STATUS(common::kTypeError, "can't get element from ", arr_type->DebugString(), ", expect map or array"); + } + return {}; +} +ExprNode *ArrayElementExpr::array() const { return GetChild(0); } +ExprNode *ArrayElementExpr::position() const { return GetChild(1); } } // namespace node } // namespace hybridse diff --git a/hybridse/src/node/node_manager.cc b/hybridse/src/node/node_manager.cc index 86d51249e19..5b1d18e5973 100644 --- a/hybridse/src/node/node_manager.cc +++ b/hybridse/src/node/node_manager.cc @@ -484,12 +484,6 @@ SqlNode *NodeManager::MakeColumnIndexNode(SqlNodeList *keys, SqlNode *ts, SqlNod return RegisterNode(node_ptr); } -SqlNode *NodeManager::MakeColumnDescNode(const std::string &column_name, const DataType data_type, bool op_not_null, - ExprNode *default_value) { - SqlNode *node_ptr = new ColumnDefNode(column_name, data_type, op_not_null, default_value); - return RegisterNode(node_ptr); -} - SqlNodeList *NodeManager::MakeNodeList() { SqlNodeList *new_list_ptr = new SqlNodeList(); RegisterNode(new_list_ptr); @@ -792,9 +786,10 @@ AllNode *NodeManager::MakeAllNode(const std::string &relation_name, const std::s } SqlNode *NodeManager::MakeInsertTableNode(const std::string &db_name, const std::string &table_name, - const ExprListNode *columns_expr, const ExprListNode *values) { + const ExprListNode *columns_expr, const ExprListNode *values, + InsertStmt::InsertMode insert_mode) { if (nullptr == columns_expr) { - InsertStmt *node_ptr = new InsertStmt(db_name, table_name, values->children_); + InsertStmt *node_ptr = new InsertStmt(db_name, table_name, values->children_, insert_mode); return RegisterNode(node_ptr); } else { std::vector column_names; @@ -811,7 +806,7 @@ SqlNode *NodeManager::MakeInsertTableNode(const std::string &db_name, const std: } } } - InsertStmt *node_ptr = new InsertStmt(db_name, table_name, column_names, values->children_); + InsertStmt *node_ptr = new InsertStmt(db_name, table_name, column_names, values->children_, insert_mode); return RegisterNode(node_ptr); } } diff --git a/hybridse/src/node/plan_node.cc b/hybridse/src/node/plan_node.cc index f601696e605..c829ab880e5 100644 --- a/hybridse/src/node/plan_node.cc +++ b/hybridse/src/node/plan_node.cc @@ -224,6 +224,10 @@ std::string NameOfPlanNodeType(const PlanType &type) { return "kPlanTypeShow"; case kPlanTypeAlterTable: return "kPlanTypeAlterTable"; + case kPlanTypeCreateUser: + return "kPlanTypeCreateUser"; + case kPlanTypeAlterUser: + return "kPlanTypeAlterUser"; case kUnknowPlan: return std::string("kUnknow"); } @@ -708,6 +712,28 @@ void DeployPlanNode::Print(std::ostream &output, const std::string &tab) const { PrintSqlNode(output, new_tab, Stmt(), "stmt", true); } +void CreateUserPlanNode::Print(std::ostream &output, const std::string &tab) const { + PlanNode::Print(output, tab); + output << "\n"; + std::string new_tab = tab + INDENT; + PrintValue(output, new_tab, IfNotExists() ? "true": "false", "if_not_exists", false); + output << "\n"; + PrintValue(output, new_tab, Name(), "name", false); + output << "\n"; + PrintValue(output, new_tab, Options().get(), "options", true); +} + +void AlterUserPlanNode::Print(std::ostream &output, const std::string &tab) const { + PlanNode::Print(output, tab); + output << "\n"; + std::string new_tab = tab + INDENT; + PrintValue(output, new_tab, IfExists() ? 
"true": "false", "if_exists", false); + output << "\n"; + PrintValue(output, new_tab, Name(), "name", false); + output << "\n"; + PrintValue(output, new_tab, Options().get(), "options", true); +} + void LoadDataPlanNode::Print(std::ostream &output, const std::string &org_tab) const { PlanNode::Print(output, org_tab); diff --git a/hybridse/src/node/plan_node_test.cc b/hybridse/src/node/plan_node_test.cc index 5ffb76142a7..aac111f8bf3 100644 --- a/hybridse/src/node/plan_node_test.cc +++ b/hybridse/src/node/plan_node_test.cc @@ -234,11 +234,12 @@ TEST_F(PlanNodeTest, ExtractColumnsAndIndexsTest) { index_node->SetName("index1"); CreatePlanNode *node = manager_->MakeCreateTablePlanNode( "", "t1", - {manager_->MakeColumnDescNode("col1", node::kInt32, true), - manager_->MakeColumnDescNode("col2", node::kInt32, true), - manager_->MakeColumnDescNode("col3", node::kFloat, true), - manager_->MakeColumnDescNode("col4", node::kVarchar, true), - manager_->MakeColumnDescNode("col5", node::kTimestamp, true), index_node}, + {manager_->MakeNode("col1", manager_->MakeNode(node::kInt32, true)), + manager_->MakeNode("col2", manager_->MakeNode(node::kInt32, true)), + manager_->MakeNode("col3", manager_->MakeNode(node::kFloat, true)), + manager_->MakeNode("col4", manager_->MakeNode(node::kVarchar, true)), + manager_->MakeNode("col5", manager_->MakeNode(node::kTimestamp, true)), + index_node}, {manager_->MakeReplicaNumNode(3), manager_->MakePartitionNumNode(8), manager_->MakeNode(kMemory)}, false); diff --git a/hybridse/src/node/sql_node.cc b/hybridse/src/node/sql_node.cc index 9114bad2d53..478805c5f05 100644 --- a/hybridse/src/node/sql_node.cc +++ b/hybridse/src/node/sql_node.cc @@ -17,7 +17,6 @@ #include "node/sql_node.h" #include -#include #include #include #include @@ -53,6 +52,8 @@ static absl::flat_hash_map CreateCmdTypeNamesMap() { {CmdType::kCmdShowTables, "show tables"}, {CmdType::kCmdUseDatabase, "use database"}, {CmdType::kCmdDropDatabase, "drop database"}, + {CmdType::kCmdDropUser, "drop user"}, + {CmdType::kCmdShowUser, "show user"}, {CmdType::kCmdCreateDatabase, "create database"}, {CmdType::kCmdDescTable, "desc table"}, {CmdType::kCmdDropTable, "drop table"}, @@ -142,6 +143,7 @@ static absl::flat_hash_map CreateExprTypeNamesMap() {kExprOrderExpression, "order"}, {kExprEscaped, "escape"}, {kExprArray, "array"}, + {kExprArrayElement, "array element"}, }; for (auto kind = 0; kind < ExprType::kExprLast; ++kind) { DCHECK(map.find(static_cast(kind)) != map.end()); @@ -1181,10 +1183,13 @@ static absl::flat_hash_map CreateSqlNodeTypeToNa {kSetStmt, "kSetStmt"}, {kDeleteStmt, "kDeleteStmt"}, {kCreateFunctionStmt, "kCreateFunctionStmt"}, + {kCreateUserStmt, "kCreateUserStmt"}, + {kAlterUserStmt, "kAlterUserStmt"}, {kDynamicUdfFnDef, "kDynamicUdfFnDef"}, {kDynamicUdafFnDef, "kDynamicUdafFnDef"}, {kWithClauseEntry, "kWithClauseEntry"}, {kAlterTableStmt, "kAlterTableStmt"}, + {kColumnSchema, "kColumnSchema"}, }; for (auto kind = 0; kind < SqlNodeType::kSqlNodeTypeLast; ++kind) { DCHECK(map.find(static_cast(kind)) != map.end()) @@ -1454,19 +1459,35 @@ void CreateTableLikeClause::Print(std::ostream &output, const std::string &tab) output << "\n"; } +std::string ColumnSchemaNode::DebugString() const { + auto res = DataTypeName(type()); + if (!generics().empty()) { + absl::StrAppend(&res, "<", + absl::StrJoin(generics(), ", ", + [](std::string *out, const ColumnSchemaNode *in) { + absl::StrAppend(out, in->DebugString()); + }), + ">"); + } + + if (not_null()) { + absl::StrAppend(&res, " NOT NULL"); + } + + if 
(default_value()) { + absl::StrAppend(&res, " DEFAULT ", default_value()->GetExprString()); + } + + return res; +} + void ColumnDefNode::Print(std::ostream &output, const std::string &org_tab) const { SqlNode::Print(output, org_tab); const std::string tab = org_tab + INDENT + SPACE_ED; output << "\n"; - PrintValue(output, tab, column_name_, "column_name", false); - output << "\n"; - PrintValue(output, tab, DataTypeName(column_type_), "column_type", false); + PrintValue(output, tab, GetColumnName(), "column_name", false); output << "\n"; - PrintValue(output, tab, std::to_string(op_not_null_), "NOT NULL", !default_value_); - if (default_value_) { - output << "\n"; - PrintSqlNode(output, tab, default_value_, "default_value", true); - } + PrintValue(output, tab, schema_->DebugString(), "column_type", true); } void ColumnIndexNode::SetTTL(ExprListNode *ttl_node_list) { @@ -1627,6 +1648,29 @@ void CreateIndexNode::Print(std::ostream &output, const std::string &org_tab) co output << "\n"; PrintSqlNode(output, tab, index_, "index", true); } + +void CreateUserNode::Print(std::ostream &output, const std::string &org_tab) const { + SqlNode::Print(output, org_tab); + const std::string tab = org_tab + INDENT + SPACE_ED; + output << "\n"; + PrintValue(output, tab, if_not_exists_ ? "true" : "false", "if_not_exists", false); + output << "\n"; + PrintValue(output, tab, name_, "user", false); + output << "\n"; + PrintValue(output, tab, Options().get(), "options", true); +} + +void AlterUserNode::Print(std::ostream &output, const std::string &org_tab) const { + SqlNode::Print(output, org_tab); + const std::string tab = org_tab + INDENT + SPACE_ED; + output << "\n"; + PrintValue(output, tab, if_exists_ ? "true" : "false", "if_exists", false); + output << "\n"; + PrintValue(output, tab, name_, "user", false); + output << "\n"; + PrintValue(output, tab, Options().get(), "options", true); +} + void ExplainNode::Print(std::ostream &output, const std::string &org_tab) const { SqlNode::Print(output, org_tab); const std::string tab = org_tab + INDENT + SPACE_ED; @@ -1995,25 +2039,6 @@ void StructExpr::Print(std::ostream &output, const std::string &org_tab) const { PrintSqlNode(output, tab, methods_, "methods", true); } -void TypeNode::Print(std::ostream &output, const std::string &org_tab) const { - SqlNode::Print(output, org_tab); - const std::string tab = org_tab + INDENT + SPACE_ED; - - output << "\n"; - PrintValue(output, tab, GetName(), "type", true); -} -bool TypeNode::Equals(const SqlNode *node) const { - if (!SqlNode::Equals(node)) { - return false; - } - - const TypeNode *that = dynamic_cast(node); - return this->base_ == that->base_ && - std::equal( - this->generics_.cbegin(), this->generics_.cend(), that->generics_.cbegin(), - [&](const hybridse::node::TypeNode *a, const hybridse::node::TypeNode *b) { return TypeEquals(a, b); }); -} - void JoinNode::Print(std::ostream &output, const std::string &org_tab) const { TableRefNode::Print(output, org_tab); @@ -2729,6 +2754,19 @@ std::string DropPathAction::DebugString() const { return absl::Substitute("DropPathAction ($0)", target_); } +std::string SetOptionsAction::DebugString() const { + std::string output; + for (const auto& kv : *options_) { + if (!output.empty()) { + absl::StrAppend(&output, ", "); + } + absl::StrAppend(&output, kv.first); + absl::StrAppend(&output, "="); + absl::StrAppend(&output, kv.second->GetAsString()); + } + return absl::Substitute("SetOptionsAction ($0)", output); +} + bool SetOperationNode::Equals(const SqlNode *node) const { auto 
*rhs = dynamic_cast(node); return this->QueryNode::Equals(node) && this->op_type() == rhs->op_type() && this->distinct() == rhs->distinct() && diff --git a/hybridse/src/node/sql_node_test.cc b/hybridse/src/node/sql_node_test.cc index e2938656dcc..67bb861a812 100644 --- a/hybridse/src/node/sql_node_test.cc +++ b/hybridse/src/node/sql_node_test.cc @@ -209,11 +209,11 @@ TEST_F(SqlNodeTest, MakeWindowDefNodetTest) { ExprListNode *partitions = node_manager_->MakeExprList(); ExprNode *ptr1 = node_manager_->MakeColumnRefNode("keycol", ""); - partitions->PushBack(ptr1); + partitions->AddChild(ptr1); ExprNode *ptr2 = node_manager_->MakeOrderExpression(node_manager_->MakeColumnRefNode("col1", ""), true); ExprListNode *orders = node_manager_->MakeExprList(); - orders->PushBack(ptr2); + orders->AddChild(ptr2); int64_t maxsize = 0; SqlNode *frame = @@ -286,29 +286,30 @@ TEST_F(SqlNodeTest, NewFrameNodeTest) { TEST_F(SqlNodeTest, MakeInsertNodeTest) { ExprListNode *column_expr_list = node_manager_->MakeExprList(); ExprNode *ptr1 = node_manager_->MakeColumnRefNode("col1", ""); - column_expr_list->PushBack(ptr1); + column_expr_list->AddChild(ptr1); ExprNode *ptr2 = node_manager_->MakeColumnRefNode("col2", ""); - column_expr_list->PushBack(ptr2); + column_expr_list->AddChild(ptr2); ExprNode *ptr3 = node_manager_->MakeColumnRefNode("col3", ""); - column_expr_list->PushBack(ptr3); + column_expr_list->AddChild(ptr3); ExprNode *ptr4 = node_manager_->MakeColumnRefNode("col4", ""); - column_expr_list->PushBack(ptr4); + column_expr_list->AddChild(ptr4); ExprListNode *value_expr_list = node_manager_->MakeExprList(); ExprNode *value1 = node_manager_->MakeConstNode(1); ExprNode *value2 = node_manager_->MakeConstNode(2.3f); ExprNode *value3 = node_manager_->MakeConstNode(2.3); ExprNode *value4 = node_manager_->MakeParameterExpr(1); - value_expr_list->PushBack(value1); - value_expr_list->PushBack(value2); - value_expr_list->PushBack(value3); - value_expr_list->PushBack(value4); + value_expr_list->AddChild(value1); + value_expr_list->AddChild(value2); + value_expr_list->AddChild(value3); + value_expr_list->AddChild(value4); ExprListNode *insert_values = node_manager_->MakeExprList(); - insert_values->PushBack(value_expr_list); - SqlNode *node_ptr = node_manager_->MakeInsertTableNode("", "t1", column_expr_list, insert_values); + insert_values->AddChild(value_expr_list); + SqlNode *node_ptr = node_manager_->MakeInsertTableNode("", "t1", column_expr_list, insert_values, + InsertStmt::InsertMode::DEFAULT_MODE); ASSERT_EQ(kInsertStmt, node_ptr->GetType()); InsertStmt *insert_stmt = dynamic_cast(node_ptr); @@ -670,11 +671,17 @@ TEST_F(SqlNodeTest, CreateIndexNodeTest) { ColumnIndexNode *index_node = dynamic_cast(node_manager_->MakeColumnIndexNode(index_items)); CreatePlanNode *node = node_manager_->MakeCreateTablePlanNode( "", "t1", - {node_manager_->MakeColumnDescNode("col1", node::kInt32, true), - node_manager_->MakeColumnDescNode("col2", node::kInt32, true), - node_manager_->MakeColumnDescNode("col3", node::kFloat, true), - node_manager_->MakeColumnDescNode("col4", node::kVarchar, true), - node_manager_->MakeColumnDescNode("col5", node::kTimestamp, true), index_node}, + {node_manager_->MakeNode( + "col1", node_manager_->MakeNode(node::kInt32, true, nullptr)), + node_manager_->MakeNode( + "col2", node_manager_->MakeNode(node::kInt32, true, nullptr)), + node_manager_->MakeNode( + "col3", node_manager_->MakeNode(node::kFloat, true, nullptr)), + node_manager_->MakeNode( + "col4", node_manager_->MakeNode(node::kVarchar, true, 
nullptr)), + node_manager_->MakeNode( + "col5", node_manager_->MakeNode(node::kTimestamp, true, nullptr)), + index_node}, {node_manager_->MakeReplicaNumNode(3), node_manager_->MakePartitionNumNode(8), node_manager_->MakeNode(kMemory)}, false); diff --git a/hybridse/src/node/type_node.cc b/hybridse/src/node/type_node.cc index e0052fca74c..c3c1015ce8f 100644 --- a/hybridse/src/node/type_node.cc +++ b/hybridse/src/node/type_node.cc @@ -20,7 +20,6 @@ #include "absl/strings/str_join.h" #include "absl/strings/str_cat.h" #include "node/node_manager.h" -#include "vm/physical_op.h" namespace hybridse { namespace node { @@ -52,7 +51,11 @@ bool TypeNode::IsTimestamp() const { return base_ == node::kTimestamp; } bool TypeNode::IsString() const { return base_ == node::kVarchar; } bool TypeNode::IsArithmetic() const { return IsInteger() || IsFloating(); } bool TypeNode::IsNumber() const { return IsInteger() || IsFloating(); } -bool TypeNode::IsNull() const { return base_ == node::kNull; } + +// Better function name ? Note the difference of VOID and NULL, VOID is a data type +// while NULL is a placeholder for missing or unknown information, not a real data type. +bool TypeNode::IsNull() const { return base_ == node::kNull || base_ == node::kVoid; } + bool TypeNode::IsBool() const { return base_ == node::kBool; } bool TypeNode::IsIntegral() const { @@ -137,5 +140,89 @@ FixedArrayType *FixedArrayType::ShadowCopy(NodeManager *nm) const { return nm->MakeArrayType(element_type(), num_elements_); } +void TypeNode::AddGeneric(const node::TypeNode *dtype, bool nullable) { + generics_.push_back(dtype); + generics_nullable_.push_back(nullable); +} +const hybridse::node::TypeNode *TypeNode::GetGenericType(size_t idx) const { return generics_[idx]; } +const std::string TypeNode::GetName() const { + std::string type_name = DataTypeName(base_); + if (!generics_.empty()) { + for (auto type : generics_) { + type_name.append("_"); + type_name.append(type->GetName()); + } + } + return type_name; +} + +void TypeNode::Print(std::ostream &output, const std::string &org_tab) const { + SqlNode::Print(output, org_tab); + const std::string tab = org_tab + INDENT + SPACE_ED; + + output << "\n"; + PrintValue(output, tab, GetName(), "type", true); +} +bool TypeNode::Equals(const SqlNode *node) const { + if (!SqlNode::Equals(node)) { + return false; + } + + const TypeNode *that = dynamic_cast(node); + return this->base_ == that->base_ && + std::equal( + this->generics_.cbegin(), this->generics_.cend(), that->generics_.cbegin(), + [&](const hybridse::node::TypeNode *a, const hybridse::node::TypeNode *b) { return TypeEquals(a, b); }); +} + +const std::string OpaqueTypeNode::GetName() const { return "opaque<" + std::to_string(bytes_) + ">"; } + +MapType::MapType(const TypeNode *key_ty, const TypeNode *value_ty, bool value_not_null) : TypeNode(node::kMap) { + // map key does not accept null, value is nullable unless extra attributes specified + AddGeneric(key_ty, false); + AddGeneric(value_ty, !value_not_null); +} +MapType::~MapType() {} +const TypeNode *MapType::key_type() const { return GetGenericType(0); } +const TypeNode *MapType::value_type() const { return GetGenericType(1); } +bool MapType::value_nullable() const { return IsGenericNullable(1); } + +// MAP +// 1. ALL KEYs or VALUEs must share a least common type. +// 2. KEY is simple type only: void/bool/numeric/data/timestamp/string +// 3. 
Resolve to MAP if arguments is empty +absl::StatusOr MapType::InferMapType(NodeManager* nm, absl::Span types) { + if (types.size() % 2 != 0) { + return absl::InvalidArgumentError("map expects a positive even number of arguments"); + } + + const node::TypeNode* key = nm->MakeNode(); // void type + const node::TypeNode* value = nm->MakeNode(); // void type + for (size_t i = 0; i < types.size(); i += 2) { + if (!types[i].type()->IsBaseOrNullType()) { + return absl::FailedPreconditionError( + absl::StrCat("key type for map should be void/bool/numeric/data/timestamp/string only, got ", + types[i].type()->DebugString())); + } + auto key_res = node::ExprNode::CompatibleType(nm, key, types[i].type()); + if (!key_res.ok()) { + return key_res.status(); + } + key = key_res.value(); + auto value_res = node::ExprNode::CompatibleType(nm, value, types[i + 1].type()); + if (!value_res.ok()) { + return value_res.status(); + } + value = value_res.value(); + } + + if (!types.empty() && (key->base() == kVoid || value->base() == kVoid)) { + // only empty map resolved to MAP + return absl::FailedPreconditionError("KEY/VALUE type of non-empty map can't be VOID"); + } + + return nm->MakeNode(key, value); +} + } // namespace node } // namespace hybridse diff --git a/hybridse/src/passes/lambdafy_projects.h b/hybridse/src/passes/lambdafy_projects.h index 3371cd12902..6afed956ee3 100644 --- a/hybridse/src/passes/lambdafy_projects.h +++ b/hybridse/src/passes/lambdafy_projects.h @@ -17,16 +17,12 @@ #ifndef HYBRIDSE_SRC_PASSES_LAMBDAFY_PROJECTS_H_ #define HYBRIDSE_SRC_PASSES_LAMBDAFY_PROJECTS_H_ -#include #include #include #include #include "node/expr_node.h" -#include "node/plan_node.h" #include "node/sql_node.h" -#include "udf/udf_library.h" -#include "vm/schemas_context.h" namespace hybridse { namespace passes { diff --git a/hybridse/src/plan/planner.cc b/hybridse/src/plan/planner.cc index 164dba11f2b..5553f7c7c99 100644 --- a/hybridse/src/plan/planner.cc +++ b/hybridse/src/plan/planner.cc @@ -345,7 +345,8 @@ base::Status Planner::CreateSelectQueryPlan(const node::SelectQueryNode *root, n return base::Status::OK(); } -base::Status Planner::CreateSetOperationPlan(const node::SetOperationNode *root, node::SetOperationPlanNode **plan_tree) { +base::Status Planner::CreateSetOperationPlan(const node::SetOperationNode *root, + node::SetOperationPlanNode **plan_tree) { CHECK_TRUE(nullptr != root, common::kPlanError, "can not create query plan node with null query node") auto list = node_manager_->MakeList(); @@ -758,6 +759,20 @@ base::Status SimplePlanner::CreatePlanTree(const NodePointVector &parser_trees, plan_trees.push_back(deploy_plan_node); break; } + case ::hybridse::node::kCreateUserStmt: { + auto node = dynamic_cast(parser_tree); + auto create_user_plan_node = node_manager_->MakeNode(node->Name(), + node->IfNotExists(), node->Options()); + plan_trees.push_back(create_user_plan_node); + break; + } + case ::hybridse::node::kAlterUserStmt: { + auto node = dynamic_cast(parser_tree); + auto alter_user_plan_node = node_manager_->MakeNode(node->Name(), + node->IfExists(), node->Options()); + plan_trees.push_back(alter_user_plan_node); + break; + } case ::hybridse::node::kSetStmt: { CHECK_TRUE(is_batch_mode_, common::kPlanError, "Non-support SET Op in online serving"); diff --git a/hybridse/src/plan/planner.h b/hybridse/src/plan/planner.h index 731663ab246..6da3068fdd8 100644 --- a/hybridse/src/plan/planner.h +++ b/hybridse/src/plan/planner.h @@ -49,6 +49,7 @@ class Planner { virtual ~Planner() {} virtual base::Status 
CreatePlanTree(const NodePointVector &parser_trees, PlanNodeList &plan_trees) = 0; // NOLINT (runtime/references) + static base::Status TransformTableDef(const std::string &table_name, const NodePointVector &column_desc_list, type::TableDef *table); bool MergeWindows(const std::map &map, @@ -132,11 +133,11 @@ class SimplePlanner : public Planner { bool enable_batch_window_parallelization = true, const std::unordered_map* extra_options = nullptr) : Planner(manager, is_batch_mode, is_cluster_optimized, enable_batch_window_parallelization, extra_options) {} - ~SimplePlanner() {} + ~SimplePlanner() override {} protected: base::Status CreatePlanTree(const NodePointVector &parser_trees, - PlanNodeList &plan_trees); // NOLINT + PlanNodeList &plan_trees) override; // NOLINT }; } // namespace plan diff --git a/hybridse/src/planv2/ast_node_converter.cc b/hybridse/src/planv2/ast_node_converter.cc index 5d9eb939113..0261c673423 100644 --- a/hybridse/src/planv2/ast_node_converter.cc +++ b/hybridse/src/planv2/ast_node_converter.cc @@ -25,8 +25,10 @@ #include "absl/strings/match.h" #include "absl/types/span.h" #include "base/fe_status.h" +#include "node/sql_node.h" #include "udf/udf.h" #include "zetasql/parser/ast_node_kind.h" +#include "zetasql/parser/parse_tree_manual.h" namespace hybridse { namespace plan { @@ -57,6 +59,10 @@ static base::Status ConvertAlterTableStmt(const zetasql::ASTAlterTableStatement* node::SqlNode** out); static base::Status ConvertSetOperation(const zetasql::ASTSetOperation* stmt, node::NodeManager* nm, node::SetOperationNode** out); +static base::Status ConvertSchemaNode(const zetasql::ASTColumnSchema* stmt, node::NodeManager* nm, + node::ColumnSchemaNode** out); +static base::Status ConvertArrayElement(const zetasql::ASTArrayElement* expr, node::NodeManager* nm, + node::ArrayElementExpr** out); /// Used to convert zetasql ASTExpression Node into our ExprNode base::Status ConvertExprNode(const zetasql::ASTExpression* ast_expression, node::NodeManager* node_manager, @@ -107,6 +113,13 @@ base::Status ConvertExprNode(const zetasql::ASTExpression* ast_expression, node: } return base::Status::OK(); } + case zetasql::AST_ARRAY_ELEMENT: { + node::ArrayElementExpr* expr = nullptr; + CHECK_STATUS( + ConvertGuard(ast_expression, node_manager, &expr, ConvertArrayElement)); + *output = expr; + return base::Status::OK(); + } case zetasql::AST_CASE_VALUE_EXPRESSION: { auto* case_expression = ast_expression->GetAsOrDie(); auto& arguments = case_expression->arguments(); @@ -123,7 +136,7 @@ base::Status ConvertExprNode(const zetasql::ASTExpression* ast_expression, node: node::ExprNode* then_expr = nullptr; CHECK_STATUS(ConvertExprNode(arguments[i], node_manager, &when_expr)) CHECK_STATUS(ConvertExprNode(arguments[i + 1], node_manager, &then_expr)) - when_list_expr->PushBack(node_manager->MakeWhenNode(when_expr, then_expr)); + when_list_expr->AddChild(node_manager->MakeWhenNode(when_expr, then_expr)); i += 2; } else { CHECK_STATUS(ConvertExprNode(arguments[i], node_manager, &else_expr)) @@ -147,7 +160,7 @@ base::Status ConvertExprNode(const zetasql::ASTExpression* ast_expression, node: node::ExprNode* then_expr = nullptr; CHECK_STATUS(ConvertExprNode(arguments[i], node_manager, &when_expr)) CHECK_STATUS(ConvertExprNode(arguments[i + 1], node_manager, &then_expr)) - when_list_expr->PushBack(node_manager->MakeWhenNode(when_expr, then_expr)); + when_list_expr->AddChild(node_manager->MakeWhenNode(when_expr, then_expr)); i += 2; } else { CHECK_STATUS(ConvertExprNode(arguments[i], node_manager, 
&else_expr)) @@ -653,6 +666,17 @@ base::Status ConvertStatement(const zetasql::ASTStatement* statement, node::Node dynamic_cast(node_manager->MakeCmdNode(node::CmdType::kCmdDescTable, names)); break; } + case zetasql::AST_DROP_USER_STATEMENT: { + auto drop_user_statement = statement->GetAsOrNull(); + CHECK_TRUE(drop_user_statement != nullptr, common::kSqlAstError, "not an ASTDropUserStatement"); + CHECK_TRUE(drop_user_statement->name() != nullptr, common::kSqlAstError, "invalid drop user statement"); + std::string user_name; + CHECK_STATUS(AstPathExpressionToString(drop_user_statement->name(), &user_name)); + auto node = dynamic_cast(node_manager->MakeCmdNode(node::CmdType::kCmdDropUser, user_name)); + node->SetIfExists(drop_user_statement->is_if_exists()); + *output = node; + break; + } case zetasql::AST_DROP_STATEMENT: { const zetasql::ASTDropStatement* drop_statement = statement->GetAsOrNull(); CHECK_TRUE(nullptr != drop_statement->name(), common::kSqlAstError, "not an ASTDropStatement") @@ -683,6 +707,22 @@ base::Status ConvertStatement(const zetasql::ASTStatement* statement, node::Node *output = create_index_node; break; } + case zetasql::AST_CREATE_USER_STATEMENT: { + const zetasql::ASTCreateUserStatement* create_user_stmt = + statement->GetAsOrNull(); + node::CreateUserNode* create_user_node = nullptr; + CHECK_STATUS(ConvertCreateUserStatement(create_user_stmt, node_manager, &create_user_node)) + *output = create_user_node; + break; + } + case zetasql::AST_ALTER_USER_STATEMENT: { + const zetasql::ASTAlterUserStatement* alter_user_stmt = + statement->GetAsOrNull(); + node::AlterUserNode* alter_user_node = nullptr; + CHECK_STATUS(ConvertAlterUserStatement(alter_user_stmt, node_manager, &alter_user_node)) + *output = alter_user_node; + break; + } case zetasql::AST_USE_STATEMENT: { const auto use_stmt = statement->GetAsOrNull(); CHECK_TRUE(nullptr != use_stmt, common::kSqlAstError, "not an ASTUseStatement"); @@ -1475,9 +1515,7 @@ base::Status ConvertCreateProcedureNode(const zetasql::ASTCreateProcedureStateme } // case element -// ASTColumnDefinition -> case element.schema -// ASSTSimpleColumnSchema -> ColumnDeefNode -// otherwise -> not implemented +// ASTColumnDefinition -> ColumnDefNode // ASTIndexDefinition -> ColumnIndexNode // otherwise -> not implemented base::Status ConvertTableElement(const zetasql::ASTTableElement* element, node::NodeManager* node_manager, @@ -1489,38 +1527,10 @@ base::Status ConvertTableElement(const zetasql::ASTTableElement* element, node:: auto column_def = element->GetAsOrNull(); CHECK_TRUE(column_def != nullptr, common::kSqlAstError, "not an ASTColumnDefinition"); - auto not_null_columns = column_def->schema()->FindAttributes( - zetasql::AST_NOT_NULL_COLUMN_ATTRIBUTE); - bool not_null = !not_null_columns.empty(); - const std::string name = column_def->name()->GetAsString(); - - auto kind = column_def->schema()->node_kind(); - switch (kind) { - case zetasql::AST_SIMPLE_COLUMN_SCHEMA: { - // only simple column schema is supported - auto simple_column_schema = column_def->schema()->GetAsOrNull(); - CHECK_TRUE(simple_column_schema != nullptr, common::kSqlAstError, "not and ASTSimpleColumnSchema"); - - std::string type_name = ""; - CHECK_STATUS(AstPathExpressionToString(simple_column_schema->type_name(), &type_name)) - node::DataType type; - CHECK_STATUS(node::StringToDataType(type_name, &type)); - - node::ExprNode* default_value = nullptr; - if (simple_column_schema->default_expression()) { - CHECK_STATUS( - 
ConvertExprNode(simple_column_schema->default_expression(), node_manager, &default_value)); - } - - *node = node_manager->MakeColumnDescNode(name, type, not_null, default_value); - return base::Status::OK(); - } - default: { - return base::Status(common::kSqlAstError, absl::StrCat("unsupported column schema type: ", - zetasql::ASTNode::NodeKindToString(kind))); - } - } + node::ColumnSchemaNode* schema = nullptr; + CHECK_STATUS(ConvertSchemaNode(column_def->schema(), node_manager, &schema)); + *node = node_manager->MakeNode(name, schema); break; } case zetasql::AST_INDEX_DEFINITION: { @@ -1528,13 +1538,14 @@ base::Status ConvertTableElement(const zetasql::ASTTableElement* element, node:: node::ColumnIndexNode* index_node = nullptr; CHECK_STATUS(ConvertColumnIndexNode(ast_index_node, node_manager, &index_node)); *node = index_node; - return base::Status::OK(); + break; } default: { return base::Status(common::kSqlAstError, absl::StrCat("unsupported table column elemnt: ", element->GetNodeKindString())); } } + return base::Status::OK(); } // ASTIndexDefinition node @@ -1628,14 +1639,14 @@ base::Status ConvertIndexOption(const zetasql::ASTOptionsEntry* entry, node::Nod node::DataType unit; CHECK_STATUS(ASTIntervalLIteralToNum(entry->value(), &value, &unit)); auto node = node_manager->MakeConstNode(value, unit); - ttl_list->PushBack(node); + ttl_list->AddChild(node); break; } case zetasql::AST_INT_LITERAL: { int64_t value; CHECK_STATUS(ASTIntLiteralToNum(entry->value(), &value)); auto node = node_manager->MakeConstNode(value, node::kLatest); - ttl_list->PushBack(node); + ttl_list->AddChild(node); break; } case zetasql::AST_STRUCT_CONSTRUCTOR_WITH_PARENS: { @@ -1649,11 +1660,11 @@ base::Status ConvertIndexOption(const zetasql::ASTOptionsEntry* entry, node::Nod CHECK_STATUS(ASTIntervalLIteralToNum(struct_parens->field_expression(0), &value, &unit)); auto node = node_manager->MakeConstNode(value, unit); - ttl_list->PushBack(node); + ttl_list->AddChild(node); value = 0; CHECK_STATUS(ASTIntLiteralToNum(struct_parens->field_expression(1), &value)); - ttl_list->PushBack(node_manager->MakeConstNode(value, node::kLatest)); + ttl_list->AddChild(node_manager->MakeConstNode(value, node::kLatest)); break; } default: { @@ -1962,8 +1973,9 @@ base::Status ConvertInsertStatement(const zetasql::ASTInsertStatement* root, nod } CHECK_TRUE(nullptr == root->query(), common::kSqlAstError, "Un-support insert statement with query"); - CHECK_TRUE(zetasql::ASTInsertStatement::InsertMode::DEFAULT_MODE == root->insert_mode(), common::kSqlAstError, - "Un-support insert mode ", root->GetSQLForInsertMode()); + CHECK_TRUE(zetasql::ASTInsertStatement::InsertMode::DEFAULT_MODE == root->insert_mode() || + zetasql::ASTInsertStatement::InsertMode::IGNORE == root->insert_mode(), + common::kSqlAstError, "Un-support insert mode ", root->GetSQLForInsertMode()); CHECK_TRUE(nullptr == root->returning(), common::kSqlAstError, "Un-support insert statement with return clause currently", root->GetSQLForInsertMode()); CHECK_TRUE(nullptr == root->assert_rows_modified(), common::kSqlAstError, @@ -1972,7 +1984,7 @@ base::Status ConvertInsertStatement(const zetasql::ASTInsertStatement* root, nod node::ExprListNode* column_list = node_manager->MakeExprList(); if (nullptr != root->column_list()) { for (auto column : root->column_list()->identifiers()) { - column_list->PushBack(node_manager->MakeColumnRefNode(column->GetAsString(), "")); + column_list->AddChild(node_manager->MakeColumnRefNode(column->GetAsString(), "")); } } @@ -2000,8 +2012,8 @@ 
base::Status ConvertInsertStatement(const zetasql::ASTInsertStatement* root, nod if (names.size() == 2) { db_name = names[0]; } - *output = - dynamic_cast(node_manager->MakeInsertTableNode(db_name, table_name, column_list, rows)); + *output = dynamic_cast(node_manager->MakeInsertTableNode( + db_name, table_name, column_list, rows, static_cast(root->insert_mode()))); return base::Status::OK(); } base::Status ConvertDropStatement(const zetasql::ASTDropStatement* root, node::NodeManager* node_manager, @@ -2071,6 +2083,44 @@ base::Status ConvertDropStatement(const zetasql::ASTDropStatement* root, node::N } return base::Status::OK(); } + +base::Status ConvertCreateUserStatement(const zetasql::ASTCreateUserStatement* root, node::NodeManager* node_manager, + node::CreateUserNode** output) { + CHECK_TRUE(root != nullptr, common::kSqlAstError, "not an ASTCreateUserStatement") + std::string user_name; + CHECK_TRUE(root->name() != nullptr, common::kSqlAstError, "can't create user without user name"); + CHECK_STATUS(AstPathExpressionToString(root->name(), &user_name)); + + auto options = std::make_shared(); + if (root->options_list() != nullptr) { + CHECK_STATUS(ConvertAstOptionsListToMap(root->options_list(), node_manager, options)); + } + *output = node_manager->MakeNode(user_name, root->is_if_not_exists(), options); + return base::Status::OK(); +} + +base::Status ConvertAlterUserStatement(const zetasql::ASTAlterUserStatement* root, node::NodeManager* node_manager, + node::AlterUserNode** output) { + CHECK_TRUE(root != nullptr, common::kSqlAstError, "not an ASTAlterUserStatement") + std::string user_name; + CHECK_TRUE(root->path() != nullptr, common::kSqlAstError, "can't alter user without user name"); + CHECK_STATUS(AstPathExpressionToString(root->path(), &user_name)); + std::vector actions; + if (root->action_list() != nullptr) { + for (auto &ac : root->action_list()->actions()) { + node::AlterActionBase *ac_out = nullptr; + CHECK_STATUS(convertAlterAction(ac, node_manager, &ac_out)); + actions.push_back(ac_out); + } + } + CHECK_TRUE(actions.size() == 1, common::kSqlAstError, "only one action is permitted"); + CHECK_TRUE(actions.front()->kind() == node::AlterActionBase::ActionKind::SET_OPTIONS, + common::kSqlAstError, "it should be set options"); + *output = node_manager->MakeNode(user_name, root->is_if_exists(), + (dynamic_cast(actions.front()))->Options()); + return base::Status::OK(); +} + base::Status ConvertCreateIndexStatement(const zetasql::ASTCreateIndexStatement* root, node::NodeManager* node_manager, node::CreateIndexNode** output) { CHECK_TRUE(nullptr != root, common::kSqlAstError, "not an ASTCreateIndexStatement") @@ -2191,6 +2241,7 @@ static const absl::flat_hash_map showTargetMap {"TABLE STATUS", {node::CmdType::kCmdShowTableStatus, false, true}}, {"FUNCTIONS", {node::CmdType::kCmdShowFunctions}}, {"JOBLOG", {node::CmdType::kCmdShowJobLog, true}}, + {"CURRENT_USER", {node::CmdType::kCmdShowUser}}, }; static const absl::flat_hash_map SHOW_STMT_TYPE_MAP = { @@ -2307,6 +2358,19 @@ base::Status ConvertASTType(const zetasql::ASTType* ast_type, node::NodeManager* }))); break; } + case zetasql::AST_MAP_TYPE: { + CHECK_STATUS((ConvertGuard( + ast_type, nm, output, + [](const zetasql::ASTMapType* map_tp, node::NodeManager* nm, node::TypeNode** out) -> base::Status { + node::TypeNode* key = nullptr; + node::TypeNode* value = nullptr; + CHECK_STATUS(ConvertASTType(map_tp->key_type(), nm, &key)); + CHECK_STATUS(ConvertASTType(map_tp->value_type(), nm, &value)); + *out = nm->MakeNode(key, value); 
+ return base::Status::OK(); + }))); + break; + } default: { return base::Status(common::kSqlAstError, "Un-support type: " + ast_type->GetNodeKindString()); } @@ -2352,6 +2416,21 @@ base::Status convertAlterAction(const zetasql::ASTAlterAction* action, node::Nod *out = ac; break; } + case zetasql::AST_SET_OPTIONS_ACTION: { + node::SetOptionsAction* ac = nullptr; + CHECK_STATUS(ConvertGuard( + action, nm, &ac, + [](const zetasql::ASTSetOptionsAction* in, node::NodeManager* nm, node::SetOptionsAction** out) { + auto options = std::make_shared(); + if (in->options_list() != nullptr) { + CHECK_STATUS(ConvertAstOptionsListToMap(in->options_list(), nm, options)); + } + *out = nm->MakeObj(options); + return base::Status::OK(); + })); + *out = ac; + break; + } default: FAIL_STATUS(common::kUnsupportSql, action->SingleNodeDebugString()); } @@ -2406,5 +2485,82 @@ base::Status ConvertSetOperation(const zetasql::ASTSetOperation* set_op, node::N } } +base::Status ConvertSchemaNode(const zetasql::ASTColumnSchema* stmt, node::NodeManager* nm, + node::ColumnSchemaNode** out) { + auto not_null_columns = + stmt->FindAttributes(zetasql::AST_NOT_NULL_COLUMN_ATTRIBUTE); + bool not_null = !not_null_columns.empty(); + + node::ExprNode* default_value = nullptr; + if (stmt->default_expression()) { + CHECK_STATUS(ConvertExprNode(stmt->default_expression(), nm, &default_value)); + } + + switch (stmt->node_kind()) { + case zetasql::AST_SIMPLE_COLUMN_SCHEMA: { + auto simple_column_schema = stmt->GetAsOrNull(); + CHECK_TRUE(simple_column_schema != nullptr, common::kSqlAstError, "not and ASTSimpleColumnSchema"); + + std::string type_name = ""; + CHECK_STATUS(AstPathExpressionToString(simple_column_schema->type_name(), &type_name)) + node::DataType type; + CHECK_STATUS(node::StringToDataType(type_name, &type)); + + *out = nm->MakeNode(type, not_null, default_value); + break; + } + case zetasql::AST_ARRAY_COLUMN_SCHEMA: { + CHECK_STATUS((ConvertGuard( + stmt, nm, out, + [not_null, default_value](const zetasql::ASTArrayColumnSchema* array_type, node::NodeManager* nm, + node::ColumnSchemaNode** out) -> base::Status { + node::ColumnSchemaNode* element_ty = nullptr; + CHECK_STATUS(ConvertSchemaNode(array_type->element_schema(), nm, &element_ty)); + + *out = nm->MakeNode( + node::DataType::kArray, std::initializer_list{element_ty}, + not_null, default_value); + return base::Status::OK(); + }))); + break; + } + case zetasql::AST_MAP_COLUMN_SCHEMA: { + CHECK_STATUS((ConvertGuard( + stmt, nm, out, + [not_null, default_value](const zetasql::ASTMapColumnSchema* map_type, node::NodeManager* nm, + node::ColumnSchemaNode** out) -> base::Status { + node::ColumnSchemaNode* key = nullptr; + CHECK_STATUS(ConvertSchemaNode(map_type->key_schema(), nm, &key)); + node::ColumnSchemaNode* value = nullptr; + CHECK_STATUS(ConvertSchemaNode(map_type->value_schema(), nm, &value)); + + *out = nm->MakeNode( + node::DataType::kMap, std::initializer_list{key, value}, + not_null, default_value); + return base::Status::OK(); + }))); + break; + } + default: { + return base::Status(common::kSqlAstError, + absl::StrCat("unsupported column schema type: ", stmt->GetNodeKindString())); + } + } + + return base::Status::OK(); +} + +base::Status ConvertArrayElement(const zetasql::ASTArrayElement* expr, node::NodeManager* nm, + node::ArrayElementExpr** out) { + node::ExprNode* array = nullptr; + node::ExprNode* pos = nullptr; + + CHECK_STATUS(ConvertExprNode(expr->array(), nm, &array)); + CHECK_STATUS(ConvertExprNode(expr->position(), nm, &pos)); + + *out = 
nm->MakeNode(array, pos); + return {}; +} + } // namespace plan } // namespace hybridse diff --git a/hybridse/src/planv2/ast_node_converter.h b/hybridse/src/planv2/ast_node_converter.h index e85c6cf8487..a40bacc2e10 100644 --- a/hybridse/src/planv2/ast_node_converter.h +++ b/hybridse/src/planv2/ast_node_converter.h @@ -66,6 +66,12 @@ base::Status ConvertInExpr(const zetasql::ASTInExpression* in_expr, node::NodeMa base::Status ConvertLimitOffsetNode(const zetasql::ASTLimitOffset* limit_offset, node::NodeManager* node_manager, node::SqlNode** output); +base::Status ConvertCreateUserStatement(const zetasql::ASTCreateUserStatement* root, node::NodeManager* node_manager, + node::CreateUserNode** output); + +base::Status ConvertAlterUserStatement(const zetasql::ASTAlterUserStatement* root, node::NodeManager* node_manager, + node::AlterUserNode** output); + base::Status ConvertQueryNode(const zetasql::ASTQuery* root, node::NodeManager* node_manager, node::QueryNode** output); base::Status ConvertQueryExpr(const zetasql::ASTQueryExpression* query_expr, node::NodeManager* node_manager, diff --git a/hybridse/src/planv2/ast_node_converter_test.cc b/hybridse/src/planv2/ast_node_converter_test.cc index 51447011f78..9798e69bd4f 100644 --- a/hybridse/src/planv2/ast_node_converter_test.cc +++ b/hybridse/src/planv2/ast_node_converter_test.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include "absl/strings/match.h" #include "case/sql_case.h" @@ -945,20 +946,6 @@ TEST_F(ASTNodeConverterTest, ConvertCreateTableNodeErrorTest) { auto status = ConvertCreateTableNode(create_stmt, &node_manager, &output); EXPECT_EQ(common::kTypeError, status.code); } - { - // not supported schema - const std::string sql = "create table t (a Array) "; - - std::unique_ptr parser_output; - ZETASQL_ASSERT_OK(zetasql::ParseStatement(sql, zetasql::ParserOptions(), &parser_output)); - const auto* statement = parser_output->statement(); - ASSERT_TRUE(statement->Is()); - - const auto create_stmt = statement->GetAsOrDie(); - node::CreateStmt* output = nullptr; - auto status = ConvertCreateTableNode(create_stmt, &node_manager, &output); - EXPECT_EQ(common::kSqlAstError, status.code); - } { // not supported table element const std::string sql = "create table t (a int64, primary key (a)) "; @@ -1206,6 +1193,10 @@ INSTANTIATE_TEST_SUITE_P(ASTHWindowQueryTest, ASTNodeConverterTest, testing::ValuesIn(sqlcase::InitCases("cases/plan/window_query.yaml", FILTERS))); INSTANTIATE_TEST_SUITE_P(ASTUnionQueryTest, ASTNodeConverterTest, testing::ValuesIn(sqlcase::InitCases("cases/plan/union_query.yaml", FILTERS))); +INSTANTIATE_TEST_SUITE_P(ASTAlterTest, ASTNodeConverterTest, + testing::ValuesIn(sqlcase::InitCases("cases/plan/alter.yaml", FILTERS))); +INSTANTIATE_TEST_SUITE_P(ASTConstQueryTest, ASTNodeConverterTest, + testing::ValuesIn(sqlcase::InitCases("cases/plan/const_query.yaml", FILTERS))); } // namespace plan } // namespace hybridse diff --git a/hybridse/src/planv2/plan_api.cc b/hybridse/src/planv2/plan_api.cc index affe2ca80f0..d3f8f7644bf 100644 --- a/hybridse/src/planv2/plan_api.cc +++ b/hybridse/src/planv2/plan_api.cc @@ -16,13 +16,36 @@ #include "plan/plan_api.h" #include "planv2/planner_v2.h" +#include "zetasql/parser/parser.h" #include "zetasql/public/error_helpers.h" #include "zetasql/public/error_location.pb.h" namespace hybridse { namespace plan { -using hybridse::plan::SimplePlannerV2; +base::Status PlanAPI::CreatePlanTreeFromScript(vm::SqlContext *ctx) { + zetasql::ParserOptions parser_opts; + zetasql::LanguageOptions 
language_opts; + language_opts.EnableLanguageFeature(zetasql::FEATURE_V_1_3_COLUMN_DEFAULT_VALUE); + parser_opts.set_language_options(&language_opts); + // save parse result into SqlContext so SQL engine can reference fields inside ASTNode during whole compile stage + auto zetasql_status = + zetasql::ParseScript(ctx->sql, parser_opts, zetasql::ERROR_MESSAGE_MULTI_LINE_WITH_CARET, &ctx->ast_node); + zetasql::ErrorLocation location; + if (!zetasql_status.ok()) { + zetasql::ErrorLocation location; + GetErrorLocation(zetasql_status, &location); + return {common::kSyntaxError, zetasql::FormatError(zetasql_status)}; + } + + DLOG(INFO) << "AST Node:\n" << ctx->ast_node->script()->DebugString(); + + const zetasql::ASTScript *script = ctx->ast_node->script(); + auto planner_ptr = + std::make_unique(&ctx->nm, ctx->engine_mode == vm::kBatchMode, ctx->is_cluster_optimized, + ctx->enable_batch_window_parallelization, ctx->options.get()); + return planner_ptr->CreateASTScriptPlan(script, ctx->logical_plan); +} bool PlanAPI::CreatePlanTreeFromScript(const std::string &sql, PlanNodeList &plan_trees, NodeManager *node_manager, Status &status, bool is_batch_mode, bool is_cluster, diff --git a/hybridse/src/planv2/planner_v2.h b/hybridse/src/planv2/planner_v2.h index 46627f10a90..2555ffd66e2 100644 --- a/hybridse/src/planv2/planner_v2.h +++ b/hybridse/src/planv2/planner_v2.h @@ -35,12 +35,12 @@ using node::PlanNodeList; class SimplePlannerV2 : public SimplePlanner { public: - explicit SimplePlannerV2(node::NodeManager *manager) : SimplePlanner(manager, true, false, false) {} SimplePlannerV2(node::NodeManager *manager, bool is_batch_mode, bool is_cluster_optimized = false, bool enable_batch_window_parallelization = false, const std::unordered_map *extra_options = nullptr) : SimplePlanner(manager, is_batch_mode, is_cluster_optimized, enable_batch_window_parallelization, extra_options) {} + base::Status CreateASTScriptPlan(const zetasql::ASTScript *script, PlanNodeList &plan_trees); // NOLINT (runtime/references) }; diff --git a/hybridse/src/sdk/hybridse_interface_core.i b/hybridse/src/sdk/hybridse_interface_core.i index 660f9bac7a1..9c053b69b71 100644 --- a/hybridse/src/sdk/hybridse_interface_core.i +++ b/hybridse/src/sdk/hybridse_interface_core.i @@ -118,6 +118,7 @@ SWIG_JAVABODY_PROXY(public, public, SWIGTYPE) #include "base/iterator.h" #include "vm/catalog.h" #include "vm/engine.h" +#include "vm/sql_ctx.h" #include "vm/engine_context.h" #include "vm/sql_compiler.h" #include "vm/jit_wrapper.h" @@ -140,6 +141,7 @@ using hybridse::vm::WindowOp; using hybridse::vm::EngineMode; using hybridse::vm::EngineOptions; using hybridse::vm::IndexHintHandler; +using hybridse::vm::SqlContext; using hybridse::base::Iterator; using hybridse::base::ConstIterator; using hybridse::base::Trace; diff --git a/hybridse/src/testing/engine_test_base.cc b/hybridse/src/testing/engine_test_base.cc index 7d02528b5ce..3aebea8f2de 100644 --- a/hybridse/src/testing/engine_test_base.cc +++ b/hybridse/src/testing/engine_test_base.cc @@ -409,7 +409,7 @@ Status EngineTestRunner::Compile() { DLOG(INFO) << "Physical plan:\n" << oss.str(); std::ostringstream runner_oss; - std::dynamic_pointer_cast(session_->GetCompileInfo())->GetClusterJob().Print(runner_oss, ""); + std::dynamic_pointer_cast(session_->GetCompileInfo())->GetClusterJob()->Print(runner_oss, ""); DLOG(INFO) << "Runner plan:\n" << runner_oss.str(); } return status; diff --git a/hybridse/src/udf/default_defs/map_defs.cc b/hybridse/src/udf/default_defs/map_defs.cc new file mode 100644 
index 00000000000..c1cae3e554c --- /dev/null +++ b/hybridse/src/udf/default_defs/map_defs.cc @@ -0,0 +1,123 @@ +/** + * Copyright (c) 2023 4Paradigm Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "codegen/map_ir_builder.h" +#include "codegen/ir_base_builder.h" +#include "node/expr_node.h" +#include "node/type_node.h" +#include "udf/default_udf_library.h" +#include "udf/udf_registry.h" + +namespace hybridse { +namespace udf { + +void DefaultUdfLibrary::InitMapUdfs() { + RegisterCodeGenUdf("map") + .variadic_args<>( + // infer + [](UdfResolveContext* ctx, const std::vector& arg_attrs, + ExprAttrNode* out) -> base::Status { + auto ret = node::MapType::InferMapType(ctx->node_manager(), arg_attrs); + CHECK_TRUE(ret.ok(), common::kTypeError, ret.status().ToString()); + out->SetType(ret.value()); + out->SetNullable(true); + return {}; + }, + // gen + [](codegen::CodeGenContext* ctx, const std::vector& args, + const ExprAttrNode& return_info, codegen::NativeValue* out) -> base::Status { + CHECK_TRUE(return_info.type()->IsMap(), common::kTypeError, "not a map type output"); + auto* map_type = return_info.type()->GetAsOrNull(); + CHECK_TRUE(map_type != nullptr, common::kTypeError, "can not cast to MapType"); + + ::llvm::Type* key_type = nullptr; + ::llvm::Type* value_type = nullptr; + CHECK_TRUE(codegen::GetLlvmType(ctx->GetModule(), map_type->key_type(), &key_type), + common::kCodegenError); + CHECK_TRUE(codegen::GetLlvmType(ctx->GetModule(), map_type->value_type(), &value_type), + common::kCodegenError); + codegen::MapIRBuilder builder(ctx->GetModule(), key_type, value_type); + auto res = builder.Construct(ctx, args); + if (res.ok()) { + *out = res.value(); + return {}; + } + return {common::kCodegenError, res.status().ToString()}; + }) + .doc(R"( + @brief map(key1, value1, key2, value2, ...) - Creates a map with the given key/value pairs. 
+ + Example: + + @code{.sql} + select map(1, '1', 2, '2'); + -- {1: "1", 2: "2"} + @endcode + + @since 0.9.0 + )"); + + RegisterCodeGenUdf("map_keys") + .args( + [](UdfResolveContext* ctx, const ExprAttrNode& in, ExprAttrNode* out) -> base::Status { + CHECK_TRUE(in.type()->IsMap(), common::kTypeError, "map_keys requires a map data type, got ", + in.type()->DebugString()); + + auto map_type = in.type()->GetAsOrNull(); + CHECK_TRUE(map_type != nullptr, common::kTypeError); + + out->SetType(ctx->node_manager()->MakeNode(node::kArray, map_type->key_type())); + out->SetNullable(true); + return {}; + }, + [](codegen::CodeGenContext* ctx, codegen::NativeValue in, const node::ExprAttrNode& return_info, + codegen::NativeValue* out) -> base::Status { + const node::TypeNode* type = nullptr; + CHECK_TRUE(codegen::GetFullType(ctx->node_manager(), in.GetType(), &type), common::kTypeError); + auto map_type = type->GetAsOrNull(); + CHECK_TRUE(map_type != nullptr, common::kTypeError); + + ::llvm::Type* key_type = nullptr; + ::llvm::Type* value_type = nullptr; + CHECK_TRUE(codegen::GetLlvmType(ctx->GetModule(), map_type->key_type(), &key_type), + common::kCodegenError); + CHECK_TRUE(codegen::GetLlvmType(ctx->GetModule(), map_type->value_type(), &value_type), + common::kCodegenError); + codegen::MapIRBuilder builder(ctx->GetModule(), key_type, value_type); + + auto res = builder.MapKeys(ctx, in); + if (res.ok()) { + *out = res.value(); + return {}; + } + return {common::kCodegenError, res.status().ToString()}; + }) + .doc(R"( + @brief map_keys(map) - Returns an unordered array containing the keys of the map. + + Example: + + @code{.sql} + select map_keys(map(1, '2', 3, '4')); + -- [1, 3] + @endcode + + @since 0.9.0 + )"); +} + +} // namespace udf +} // namespace hybridse diff --git a/hybridse/src/udf/default_udf_library.cc b/hybridse/src/udf/default_udf_library.cc index e6a546095ec..265a1e09250 100644 --- a/hybridse/src/udf/default_udf_library.cc +++ b/hybridse/src/udf/default_udf_library.cc @@ -665,6 +665,7 @@ void DefaultUdfLibrary::Init() { InitFeatureZero(); InitArrayUdfs(); + InitMapUdfs(); InitEarthDistanceUdf(); InitJsonUdfs(); @@ -794,7 +795,7 @@ void DefaultUdfLibrary::InitStringUdf() { RegisterCodeGenUdf("concat").variadic_args<>( /* infer */ [](UdfResolveContext* ctx, - const std::vector& arg_attrs, + const std::vector& arg_attrs, ExprAttrNode* out) { out->SetType(ctx->node_manager()->MakeTypeNode(node::kVarchar)); out->SetNullable(false); @@ -802,7 +803,7 @@ void DefaultUdfLibrary::InitStringUdf() { }, /* gen */ [](CodeGenContext* ctx, const std::vector& args, - NativeValue* out) { + const ExprAttrNode& return_info, NativeValue* out) { codegen::StringIRBuilder string_ir_builder(ctx->GetModule()); return string_ir_builder.Concat(ctx->GetCurrentBlock(), args, out); }) @@ -821,16 +822,16 @@ void DefaultUdfLibrary::InitStringUdf() { RegisterCodeGenUdf("concat_ws") .variadic_args( /* infer */ - [](UdfResolveContext* ctx, const ExprAttrNode* arg, - const std::vector& arg_types, + [](UdfResolveContext* ctx, const ExprAttrNode& arg, + const std::vector& arg_types, ExprAttrNode* out) { out->SetType(ctx->node_manager()->MakeTypeNode(node::kVarchar)); out->SetNullable(false); return Status::OK(); }, /* gen */ - [](CodeGenContext* ctx, NativeValue arg, - const std::vector& args, NativeValue* out) { + [](CodeGenContext* ctx, NativeValue arg, const std::vector& args, + const ExprAttrNode& return_info, NativeValue* out) { codegen::StringIRBuilder string_ir_builder(ctx->GetModule()); return 
string_ir_builder.ConcatWS(ctx->GetCurrentBlock(), arg, @@ -1651,7 +1652,7 @@ void DefaultUdfLibrary::InitMathUdf() { RegisterExprUdf("round") .variadic_args( - [](UdfResolveContext* ctx, ExprNode* x, const std::vector& other) -> ExprNode* { + [](UdfResolveContext* ctx, ExprNode* x, absl::Span other) -> ExprNode* { if (!x->GetOutputType()->IsArithmetic() || x->GetOutputType()->IsBool()) { ctx->SetError("round do not support first parameter of type " + x->GetOutputType()->GetName()); return nullptr; @@ -2233,18 +2234,15 @@ void DefaultUdfLibrary::InitTimeAndDateUdf() { )"); RegisterCodeGenUdf("year") - .args( - [](CodeGenContext* ctx, NativeValue date, NativeValue* out) { - codegen::DateIRBuilder date_ir_builder(ctx->GetModule()); - ::llvm::Value* ret = nullptr; - Status status; - CHECK_TRUE(date_ir_builder.Year(ctx->GetCurrentBlock(), - date.GetRaw(), &ret, status), - kCodegenError, - "Fail to build udf year(date): ", status.str()); - *out = NativeValue::Create(ret); - return status; - }) + .args([](CodeGenContext* ctx, NativeValue date, const node::ExprAttrNode& return_info, NativeValue* out) { + codegen::DateIRBuilder date_ir_builder(ctx->GetModule()); + ::llvm::Value* ret = nullptr; + Status status; + CHECK_TRUE(date_ir_builder.Year(ctx->GetCurrentBlock(), date.GetRaw(), &ret, status), kCodegenError, + "Fail to build udf year(date): ", status.str()); + *out = NativeValue::Create(ret); + return status; + }) .returns(); RegisterExternal("month") @@ -2264,7 +2262,7 @@ void DefaultUdfLibrary::InitTimeAndDateUdf() { RegisterCodeGenUdf("month") .args( - [](CodeGenContext* ctx, NativeValue date, NativeValue* out) { + [](CodeGenContext* ctx, NativeValue date, const node::ExprAttrNode& ri, NativeValue* out) { codegen::DateIRBuilder date_ir_builder(ctx->GetModule()); ::llvm::Value* ret = nullptr; Status status; @@ -2298,7 +2296,7 @@ void DefaultUdfLibrary::InitTimeAndDateUdf() { )"); RegisterCodeGenUdf("dayofmonth").args( - [](CodeGenContext* ctx, NativeValue date, NativeValue* out) { + [](CodeGenContext* ctx, NativeValue date, const node::ExprAttrNode& ri, NativeValue* out) { codegen::DateIRBuilder date_ir_builder(ctx->GetModule()); ::llvm::Value* ret = nullptr; Status status; @@ -2554,13 +2552,13 @@ void DefaultUdfLibrary::InitTimeAndDateUdf() { .variadic_args<>( /* infer */ [](UdfResolveContext* ctx, - const std::vector& args, + const std::vector& args, ExprAttrNode* out) { auto nm = ctx->node_manager(); auto tuple_type = nm->MakeTypeNode(node::kTuple); for (auto attr : args) { - tuple_type->generics_.push_back(attr->type()); - tuple_type->generics_nullable_.push_back(attr->nullable()); + tuple_type->generics_.push_back(attr.type()); + tuple_type->generics_nullable_.push_back(attr.nullable()); } out->SetType(tuple_type); out->SetNullable(false); @@ -2568,7 +2566,7 @@ void DefaultUdfLibrary::InitTimeAndDateUdf() { }, /* gen */ [](CodeGenContext* ctx, const std::vector& args, - NativeValue* out) { + const ExprAttrNode& return_info, NativeValue* out) { *out = NativeValue::CreateTuple(args); return Status::OK(); }); diff --git a/hybridse/src/udf/default_udf_library.h b/hybridse/src/udf/default_udf_library.h index be5ed6c2414..92152649fa0 100644 --- a/hybridse/src/udf/default_udf_library.h +++ b/hybridse/src/udf/default_udf_library.h @@ -52,6 +52,9 @@ class DefaultUdfLibrary : public UdfLibrary { // Array Udf defines, udfs either accept array as parameter or returns array void InitArrayUdfs(); + // Map functions + void InitMapUdfs(); + // aggregate functions for statistic void 
InitStatisticsUdafs(); diff --git a/hybridse/src/udf/dynamic_lib_manager.cc b/hybridse/src/udf/dynamic_lib_manager.cc index c6a034247cd..b3b281a0346 100644 --- a/hybridse/src/udf/dynamic_lib_manager.cc +++ b/hybridse/src/udf/dynamic_lib_manager.cc @@ -19,6 +19,8 @@ #include #include +#include "glog/logging.h" + namespace hybridse { namespace udf { diff --git a/hybridse/src/udf/literal_traits.h b/hybridse/src/udf/literal_traits.h index 13c876951e8..2c79c8a365d 100644 --- a/hybridse/src/udf/literal_traits.h +++ b/hybridse/src/udf/literal_traits.h @@ -18,15 +18,12 @@ #define HYBRIDSE_SRC_UDF_LITERAL_TRAITS_H_ #include -#include #include #include #include -#include #include #include -#include "base/fe_status.h" #include "base/string_ref.h" #include "base/type.h" #include "codec/fe_row_codec.h" @@ -139,8 +136,10 @@ static bool operator==(const Nullable& x, const Nullable& y) { // ===================================== // // ArrayRef // ===================================== // -template ::CCallArgType> +template struct ArrayRef { + using CType = typename DataTypeTrait::CCallArgType; + CType* raw; bool* nullables; uint64_t size; diff --git a/hybridse/src/udf/udf_registry.cc b/hybridse/src/udf/udf_registry.cc index 932174d8145..60e93460c24 100644 --- a/hybridse/src/udf/udf_registry.cc +++ b/hybridse/src/udf/udf_registry.cc @@ -206,20 +206,17 @@ Status ExprUdfRegistry::ResolveFunction(UdfResolveContext* ctx, Status LlvmUdfRegistry::ResolveFunction(UdfResolveContext* ctx, node::FnDefNode** result) { std::vector arg_types; - std::vector arg_attrs; + std::vector arg_attrs; for (size_t i = 0; i < ctx->arg_size(); ++i) { auto arg_type = ctx->arg_type(i); bool nullable = ctx->arg_nullable(i); CHECK_TRUE(arg_type != nullptr, kCodegenError, i, "th argument node type is unknown: ", name()); arg_types.push_back(arg_type); - arg_attrs.push_back(new ExprAttrNode(arg_type, nullable)); + arg_attrs.emplace_back(arg_type, nullable); } ExprAttrNode out_attr(nullptr, true); auto status = gen_impl_func_->infer(ctx, arg_attrs, &out_attr); - for (auto ptr : arg_attrs) { - delete const_cast(ptr); - } CHECK_STATUS(status, "Infer llvm output attr failed: ", status.str()); auto return_type = out_attr.type(); diff --git a/hybridse/src/udf/udf_registry.h b/hybridse/src/udf/udf_registry.h index 3ea96d25c13..d9512e581f0 100644 --- a/hybridse/src/udf/udf_registry.h +++ b/hybridse/src/udf/udf_registry.h @@ -28,13 +28,11 @@ #include #include "base/fe_status.h" -#include "codec/list_iterator_codec.h" #include "codegen/context.h" #include "node/node_manager.h" #include "node/sql_node.h" #include "udf/literal_traits.h" #include "udf/udf_library.h" -#include "vm/schemas_context.h" namespace hybridse { namespace udf { @@ -394,10 +392,11 @@ class LlvmUdfGenBase { public: virtual Status gen(codegen::CodeGenContext* ctx, const std::vector& args, + const ExprAttrNode& return_info, codegen::NativeValue* res) = 0; virtual Status infer(UdfResolveContext* ctx, - const std::vector& args, + const std::vector& args, ExprAttrNode*) = 0; node::TypeNode* fixed_ret_type() const { return fixed_ret_type_; } @@ -417,33 +416,36 @@ struct LlvmUdfGen : public LlvmUdfGenBase { using FType = std::function::second_type..., + const ExprAttrNode& return_info, codegen::NativeValue*)>; using InferFType = std::function::second_type..., + typename std::pair::second_type..., ExprAttrNode*)>; Status gen(codegen::CodeGenContext* ctx, const std::vector& args, + const ExprAttrNode& return_info, codegen::NativeValue* result) override { CHECK_TRUE(args.size() == 
sizeof...(Args), common::kCodegenError, "Fail to invoke LlvmUefGen::gen, args size do not " "match with template args)"); - return gen_internal(ctx, args, result, + return gen_internal(ctx, args, return_info, result, std::index_sequence_for()); } template Status gen_internal(codegen::CodeGenContext* ctx, const std::vector& args, + const ExprAttrNode& return_info, codegen::NativeValue* result, const std::index_sequence&) { - return gen_func(ctx, args[I]..., result); + return gen_func(ctx, args[I]..., return_info, result); } Status infer(UdfResolveContext* ctx, - const std::vector& args, + const std::vector& args, ExprAttrNode* out) override { return infer_internal(ctx, args, out, std::index_sequence_for()); @@ -451,7 +453,7 @@ struct LlvmUdfGen : public LlvmUdfGenBase { template Status infer_internal(UdfResolveContext* ctx, - const std::vector& args, + const std::vector& args, ExprAttrNode* out, const std::index_sequence&) { if (this->infer_func) { return infer_func(ctx, args[I]..., out); @@ -475,39 +477,39 @@ struct LlvmUdfGen : public LlvmUdfGenBase { template struct VariadicLLVMUdfGen : public LlvmUdfGenBase { using FType = std::function::second_type..., - const std::vector&, codegen::NativeValue*)>; + codegen::CodeGenContext*, typename std::pair::second_type..., + const std::vector&, const ExprAttrNode& return_info, codegen::NativeValue*)>; using InferFType = std::function::second_type..., - const std::vector&, ExprAttrNode*)>; + typename std::pair::second_type..., + const std::vector&, ExprAttrNode*)>; Status gen(codegen::CodeGenContext* ctx, const std::vector& args, + const ExprAttrNode& return_info, codegen::NativeValue* result) override { CHECK_TRUE(args.size() >= sizeof...(Args), common::kCodegenError, "Fail to invoke VariadicLLVMUdfGen::gen, " "args size do not match with template args)"); - return gen_internal(ctx, args, result, - std::index_sequence_for()); + return gen_internal(ctx, args, return_info, result, std::index_sequence_for()); }; template Status gen_internal(codegen::CodeGenContext* ctx, const std::vector& args, + const ExprAttrNode& return_info, codegen::NativeValue* result, const std::index_sequence&) { std::vector variadic_args; for (size_t i = sizeof...(I); i < args.size(); ++i) { variadic_args.emplace_back(args[i]); } - return this->gen_func(ctx, args[I]..., variadic_args, result); + return this->gen_func(ctx, args[I]..., variadic_args, return_info, result); } Status infer(UdfResolveContext* ctx, - const std::vector& args, + const std::vector& args, ExprAttrNode* out) override { return infer_internal(ctx, args, out, std::index_sequence_for()); @@ -515,9 +517,9 @@ struct VariadicLLVMUdfGen : public LlvmUdfGenBase { template Status infer_internal(UdfResolveContext* ctx, - const std::vector& args, + const std::vector& args, ExprAttrNode* out, const std::index_sequence&) { - std::vector variadic_args; + std::vector variadic_args; for (size_t i = sizeof...(I); i < args.size(); ++i) { variadic_args.emplace_back(args[i]); } @@ -723,9 +725,8 @@ class CodeGenUdfTemplateRegistryHelper { LlvmUdfRegistryHelper& helper) { // NOLINT helper.args( [](codegen::CodeGenContext* ctx, - typename std::pair< - Args, codegen::NativeValue>::second_type... args, - codegen::NativeValue* result) { + typename std::pair::second_type... 
args, + const ExprAttrNode& return_info, codegen::NativeValue* result) { return FTemplate()(ctx, args..., result); }); return helper.cur_def(); diff --git a/hybridse/src/udf/udf_registry_test.cc b/hybridse/src/udf/udf_registry_test.cc index 962b367819b..aac28fc8f17 100644 --- a/hybridse/src/udf/udf_registry_test.cc +++ b/hybridse/src/udf/udf_registry_test.cc @@ -384,14 +384,14 @@ TEST_F(UdfRegistryTest, test_codegen_udf_register) { library.RegisterCodeGenUdf("add").args( /* infer */ - [](UdfResolveContext* ctx, const ExprAttrNode* x, const ExprAttrNode* y, + [](UdfResolveContext* ctx, const ExprAttrNode& x, const ExprAttrNode& y, ExprAttrNode* out) { - out->SetType(x->type()); + out->SetType(x.type()); return Status::OK(); }, /* gen */ [](CodeGenContext* ctx, NativeValue x, NativeValue y, - NativeValue* out) { + const ExprAttrNode& ri, NativeValue* out) { *out = x; return Status::OK(); }); @@ -409,14 +409,14 @@ TEST_F(UdfRegistryTest, test_variadic_codegen_udf_register) { library.RegisterCodeGenUdf("concat").variadic_args<>( /* infer */ [](UdfResolveContext* ctx, - const std::vector& arg_attrs, + const std::vector& arg_attrs, ExprAttrNode* out) { - out->SetType(arg_attrs[0]->type()); + out->SetType(arg_attrs[0].type()); return Status::OK(); }, /* gen */ [](CodeGenContext* ctx, const std::vector& args, - NativeValue* out) { + const ExprAttrNode& return_info, NativeValue* out) { *out = args[0]; return Status::OK(); }); diff --git a/hybridse/src/vm/engine.cc b/hybridse/src/vm/engine.cc index c0d9be8c333..0865655f3c1 100644 --- a/hybridse/src/vm/engine.cc +++ b/hybridse/src/vm/engine.cc @@ -160,7 +160,7 @@ bool Engine::Get(const std::string& sql, const std::string& db, RunSession& sess sql_context.enable_expr_optimize = options_.IsEnableExprOptimize(); sql_context.jit_options = options_.jit_options(); sql_context.options = session.GetOptions(); - sql_context.index_hints_ = session.index_hints_; + sql_context.index_hints = session.index_hints_; if (session.engine_mode() == kBatchMode) { sql_context.parameter_types = dynamic_cast(&session)->GetParameterSchema(); } else if (session.engine_mode() == kBatchRequestMode) { @@ -191,7 +191,7 @@ bool Engine::Get(const std::string& sql, const std::string& db, RunSession& sess LOG(INFO) << "physical plan:\n" << plan_oss.str() << std::endl; } std::ostringstream runner_oss; - sql_context.cluster_job.Print(runner_oss, ""); + sql_context.cluster_job->Print(runner_oss, ""); LOG(INFO) << "cluster job:\n" << runner_oss.str() << std::endl; } return true; @@ -377,20 +377,20 @@ bool RunSession::SetCompileInfo(const std::shared_ptr& compile_info int32_t RequestRunSession::Run(const Row& in_row, Row* out_row) { DLOG(INFO) << "Request Row Run with main task"; - return Run(std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job.main_task_id(), + return Run(std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job->main_task_id(), in_row, out_row); } int32_t RequestRunSession::Run(const uint32_t task_id, const Row& in_row, Row* out_row) { auto task = std::dynamic_pointer_cast(compile_info_) ->get_sql_context() - .cluster_job.GetTask(task_id) + .cluster_job->GetTask(task_id) .GetRoot(); if (nullptr == task) { LOG(WARNING) << "fail to run request plan: taskid" << task_id << " not exist!"; return -2; } DLOG(INFO) << "Request Row Run with task_id " << task_id; - RunnerContext ctx(&std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job, in_row, + RunnerContext 
ctx(std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job, in_row, sp_name_, is_debug_); auto output = task->RunWithCache(ctx); if (!output) { @@ -405,15 +405,15 @@ int32_t RequestRunSession::Run(const uint32_t task_id, const Row& in_row, Row* o } int32_t BatchRequestRunSession::Run(const std::vector& request_batch, std::vector& output) { - return Run(std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job.main_task_id(), + return Run(std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job->main_task_id(), request_batch, output); } int32_t BatchRequestRunSession::Run(const uint32_t id, const std::vector& request_batch, std::vector& output) { - RunnerContext ctx(&std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job, + RunnerContext ctx(std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job, request_batch, sp_name_, is_debug_); auto task = - std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job.GetTask(id).GetRoot(); + std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job->GetTask(id).GetRoot(); if (nullptr == task) { LOG(WARNING) << "Fail to run request plan: taskid" << id << " not exist!"; return -2; @@ -435,8 +435,8 @@ int32_t BatchRunSession::Run(std::vector& rows, uint64_t limit) { } int32_t BatchRunSession::Run(const Row& parameter_row, std::vector& rows, uint64_t limit) { auto& sql_ctx = std::dynamic_pointer_cast(compile_info_)->get_sql_context(); - RunnerContext ctx(&sql_ctx.cluster_job, parameter_row, is_debug_); - auto output = sql_ctx.cluster_job.GetTask(0).GetRoot()->RunWithCache(ctx); + RunnerContext ctx(sql_ctx.cluster_job, parameter_row, is_debug_); + auto output = sql_ctx.cluster_job->GetTask(0).GetRoot()->RunWithCache(ctx); if (!output) { DLOG(INFO) << "Run batch plan output is empty"; return 0; diff --git a/hybridse/src/vm/runner_ctx.h b/hybridse/src/vm/runner_ctx.h index 0924015450a..350d2372a09 100644 --- a/hybridse/src/vm/runner_ctx.h +++ b/hybridse/src/vm/runner_ctx.h @@ -29,8 +29,7 @@ namespace vm { class RunnerContext { public: - explicit RunnerContext(hybridse::vm::ClusterJob* cluster_job, - const hybridse::codec::Row& parameter, + explicit RunnerContext(std::shared_ptr cluster_job, const hybridse::codec::Row& parameter, const bool is_debug = false) : cluster_job_(cluster_job), sp_name_(""), @@ -39,7 +38,7 @@ class RunnerContext { parameter_(parameter), is_debug_(is_debug), batch_cache_() {} - explicit RunnerContext(hybridse::vm::ClusterJob* cluster_job, + explicit RunnerContext(std::shared_ptr cluster_job, const hybridse::codec::Row& request, const std::string& sp_name = "", const bool is_debug = false) @@ -50,7 +49,7 @@ class RunnerContext { parameter_(), is_debug_(is_debug), batch_cache_() {} - explicit RunnerContext(hybridse::vm::ClusterJob* cluster_job, + explicit RunnerContext(std::shared_ptr cluster_job, const std::vector& request_batch, const std::string& sp_name = "", const bool is_debug = false) @@ -68,7 +67,7 @@ class RunnerContext { return requests_[idx]; } const hybridse::codec::Row& GetParameterRow() const { return parameter_; } - hybridse::vm::ClusterJob* cluster_job() { return cluster_job_; } + std::shared_ptr cluster_job() { return cluster_job_; } void SetRequest(const hybridse::codec::Row& request); void SetRequests(const std::vector& requests); bool is_debug() const { return is_debug_; } @@ -81,7 +80,7 @@ class RunnerContext { void SetBatchCache(int64_t id, std::shared_ptr data); private: - hybridse::vm::ClusterJob* 
cluster_job_; + std::shared_ptr cluster_job_; const std::string sp_name_; hybridse::codec::Row request_; std::vector requests_; diff --git a/hybridse/src/vm/runner_test.cc b/hybridse/src/vm/runner_test.cc index ea8d9c9643e..bce8c8712d3 100644 --- a/hybridse/src/vm/runner_test.cc +++ b/hybridse/src/vm/runner_test.cc @@ -75,13 +75,13 @@ void RunnerCheck(std::shared_ptr catalog, const std::string sql, ASSERT_TRUE(ok) << compile_status; ASSERT_TRUE(sql_compiler.BuildClusterJob(sql_context, compile_status)); ASSERT_TRUE(nullptr != sql_context.physical_plan); - ASSERT_TRUE(sql_context.cluster_job.IsValid()); + ASSERT_TRUE(sql_context.cluster_job->IsValid()); std::ostringstream oss; sql_context.physical_plan->Print(oss, ""); std::cout << "physical plan:\n" << sql << "\n" << oss.str() << std::endl; std::ostringstream runner_oss; - sql_context.cluster_job.Print(runner_oss, ""); + sql_context.cluster_job->Print(runner_oss, ""); std::cout << "runner: \n" << runner_oss.str() << std::endl; std::ostringstream oss_schema; @@ -349,7 +349,7 @@ TEST_F(RunnerTest, KeyGeneratorTest) { ASSERT_TRUE(sql_context.physical_plan != nullptr); auto root = GetFirstRunnerOfType( - sql_context.cluster_job.GetTask(0).GetRoot(), kRunnerGroup); + sql_context.cluster_job->GetTask(0).GetRoot(), kRunnerGroup); auto group_runner = dynamic_cast(root); std::vector rows; hybridse::type::TableDef temp_table; diff --git a/hybridse/src/vm/sql_compiler.cc b/hybridse/src/vm/sql_compiler.cc index c686e1401b4..ea5626545ee 100644 --- a/hybridse/src/vm/sql_compiler.cc +++ b/hybridse/src/vm/sql_compiler.cc @@ -159,7 +159,7 @@ Status SqlCompiler::BuildBatchModePhysicalPlan(SqlContext* ctx, const ::hybridse vm::BatchModeTransformer transformer(&ctx->nm, ctx->db, cl_, &ctx->parameter_types, llvm_module, library, ctx->is_cluster_optimized, ctx->enable_expr_optimize, ctx->enable_batch_window_parallelization, ctx->enable_window_column_pruning, - ctx->options.get(), ctx->index_hints_); + ctx->options.get(), ctx->index_hints); transformer.AddDefaultPasses(); CHECK_STATUS(transformer.TransformPhysicalPlan(plan_list, output), "Fail to generate physical plan batch mode"); ctx->schema = *(*output)->GetOutputSchema(); @@ -172,7 +172,7 @@ Status SqlCompiler::BuildRequestModePhysicalPlan(SqlContext* ctx, const ::hybrid PhysicalOpNode** output) { vm::RequestModeTransformer transformer(&ctx->nm, ctx->db, cl_, &ctx->parameter_types, llvm_module, library, {}, ctx->is_cluster_optimized, false, ctx->enable_expr_optimize, - enable_request_performance_sensitive, ctx->options.get(), ctx->index_hints_); + enable_request_performance_sensitive, ctx->options.get(), ctx->index_hints); if (ctx->options && ctx->options->count(LONG_WINDOWS)) { transformer.AddPass(passes::kPassSplitAggregationOptimized); transformer.AddPass(passes::kPassLongWindowOptimized); @@ -196,7 +196,7 @@ Status SqlCompiler::BuildBatchRequestModePhysicalPlan(SqlContext* ctx, const ::h vm::RequestModeTransformer transformer(&ctx->nm, ctx->db, cl_, &ctx->parameter_types, llvm_module, library, ctx->batch_request_info.common_column_indices, ctx->is_cluster_optimized, ctx->is_batch_request_optimized, ctx->enable_expr_optimize, true, - ctx->options.get(), ctx->index_hints_); + ctx->options.get(), ctx->index_hints); if (ctx->options && ctx->options->count(LONG_WINDOWS)) { transformer.AddPass(passes::kPassSplitAggregationOptimized); transformer.AddPass(passes::kPassLongWindowOptimized); @@ -297,7 +297,10 @@ bool SqlCompiler::BuildClusterJob(SqlContext& ctx, Status& status) { // NOLINT 
ctx.is_cluster_optimized && is_request_mode, ctx.batch_request_info.common_column_indices, ctx.batch_request_info.common_node_set); - ctx.cluster_job = runner_builder.BuildClusterJob(ctx.physical_plan, status); + if (ctx.cluster_job == nullptr) { + ctx.cluster_job = std::make_shared(); + } + *ctx.cluster_job = runner_builder.BuildClusterJob(ctx.physical_plan, status); return status.isOK(); } @@ -310,11 +313,8 @@ bool SqlCompiler::BuildClusterJob(SqlContext& ctx, Status& status) { // NOLINT */ bool SqlCompiler::Parse(SqlContext& ctx, ::hybridse::base::Status& status) { // NOLINT - bool is_batch_mode = ctx.engine_mode == kBatchMode; - if (!::hybridse::plan::PlanAPI::CreatePlanTreeFromScript(ctx.sql, ctx.logical_plan, &ctx.nm, status, is_batch_mode, - ctx.is_cluster_optimized, - ctx.enable_batch_window_parallelization, - ctx.options.get())) { + status = hybridse::plan::PlanAPI::CreatePlanTreeFromScript(&ctx); + if (!status.isOK()) { LOG(WARNING) << "Fail create sql plan: " << status; return false; } diff --git a/hybridse/src/vm/sql_compiler.h b/hybridse/src/vm/sql_compiler.h index a70f5275276..a874be405fa 100644 --- a/hybridse/src/vm/sql_compiler.h +++ b/hybridse/src/vm/sql_compiler.h @@ -19,7 +19,7 @@ #include #include -#include + #include "base/fe_status.h" #include "llvm/IR/Module.h" #include "udf/udf_library.h" @@ -30,60 +30,13 @@ #include "vm/physical_op.h" #include "vm/physical_plan_context.h" #include "vm/runner.h" +#include "vm/sql_ctx.h" namespace hybridse { namespace vm { using hybridse::base::Status; -struct SqlContext { - // mode: batch|request|batch request - ::hybridse::vm::EngineMode engine_mode; - bool is_cluster_optimized = false; - bool is_batch_request_optimized = false; - bool enable_expr_optimize = false; - bool enable_batch_window_parallelization = true; - bool enable_window_column_pruning = false; - - // the sql content - std::string sql; - // the database - std::string db; - // the logical plan - ::hybridse::node::PlanNodeList logical_plan; - ::hybridse::vm::PhysicalOpNode* physical_plan = nullptr; - hybridse::vm::ClusterJob cluster_job; - // TODO(wangtaize) add a light jit engine - // eg using bthead to compile ir - hybridse::vm::JitOptions jit_options; - std::shared_ptr jit = nullptr; - Schema schema; - Schema request_schema; - std::string request_db_name; - std::string request_name; - Schema parameter_types; - uint32_t row_size; - uint32_t limit_cnt = 0; - std::string ir; - std::string logical_plan_str; - std::string physical_plan_str; - std::string encoded_schema; - std::string encoded_request_schema; - ::hybridse::node::NodeManager nm; - ::hybridse::udf::UdfLibrary* udf_library = nullptr; - - ::hybridse::vm::BatchRequestInfo batch_request_info; - - std::shared_ptr> options; - - // [ALPHA] SQL diagnostic infos - // not standardized, only index hints, no error, no warning, no other hint/info - std::shared_ptr index_hints_; - - SqlContext() {} - ~SqlContext() {} -}; - class SqlCompileInfo : public CompileInfo { public: SqlCompileInfo() : sql_ctx() {} @@ -111,13 +64,13 @@ class SqlCompileInfo : public CompileInfo { const std::string& GetRequestDbName() const override { return sql_ctx.request_db_name; } const hybridse::vm::BatchRequestInfo& GetBatchRequestInfo() const override { return sql_ctx.batch_request_info; } const hybridse::vm::PhysicalOpNode* GetPhysicalPlan() const override { return sql_ctx.physical_plan; } - hybridse::vm::Runner* GetMainTask() { return sql_ctx.cluster_job.GetMainTask().GetRoot(); } - hybridse::vm::ClusterJob& GetClusterJob() { return 
sql_ctx.cluster_job; } + hybridse::vm::Runner* GetMainTask() { return sql_ctx.cluster_job->GetMainTask().GetRoot(); } + std::shared_ptr GetClusterJob() { return sql_ctx.cluster_job; } void DumpPhysicalPlan(std::ostream& output, const std::string& tab) override { sql_ctx.physical_plan->Print(output, tab); } void DumpClusterJob(std::ostream& output, const std::string& tab) override { - sql_ctx.cluster_job.Print(output, tab); + sql_ctx.cluster_job->Print(output, tab); } static SqlCompileInfo* CastFrom(CompileInfo* node) { return dynamic_cast(node); } diff --git a/hybridse/src/vm/sql_ctx.cc b/hybridse/src/vm/sql_ctx.cc new file mode 100644 index 00000000000..b328801978c --- /dev/null +++ b/hybridse/src/vm/sql_ctx.cc @@ -0,0 +1,29 @@ +/** + * Copyright (c) 2023 OpenMLDB Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vm/sql_ctx.h" + +// DONT DELETE: unique_ptr requires full specification for underlying type +#include "zetasql/parser/parser.h" // IWYU pragma: keep + +namespace hybridse { +namespace vm { +SqlContext::SqlContext() {} + +SqlContext::~SqlContext() {} + +} // namespace vm +} // namespace hybridse diff --git a/hybridse/src/vm/transform.cc b/hybridse/src/vm/transform.cc index 82e96b3c094..49a76d95273 100644 --- a/hybridse/src/vm/transform.cc +++ b/hybridse/src/vm/transform.cc @@ -26,6 +26,7 @@ #include "codegen/context.h" #include "codegen/fn_ir_builder.h" #include "codegen/fn_let_ir_builder.h" +#include "codegen/ir_base_builder.h" #include "passes/physical/batch_request_optimize.h" #include "passes/physical/cluster_optimized.h" #include "passes/physical/condition_optimized.h" @@ -39,9 +40,9 @@ #include "passes/physical/window_column_pruning.h" #include "plan/planner.h" #include "proto/fe_common.pb.h" +#include "vm/internal/node_helper.h" #include "vm/physical_op.h" #include "vm/schemas_context.h" -#include "vm/internal/node_helper.h" namespace hybridse { namespace vm { diff --git a/java/hybridse-sdk/src/main/java/com/_4paradigm/hybridse/sdk/RequestEngine.java b/java/hybridse-sdk/src/main/java/com/_4paradigm/hybridse/sdk/RequestEngine.java index 6349455a523..ca3008650c7 100644 --- a/java/hybridse-sdk/src/main/java/com/_4paradigm/hybridse/sdk/RequestEngine.java +++ b/java/hybridse-sdk/src/main/java/com/_4paradigm/hybridse/sdk/RequestEngine.java @@ -32,7 +32,7 @@ */ public class RequestEngine implements AutoCloseable { - private static final Logger logger = LoggerFactory.getLogger(SqlEngine.class); + private static final Logger logger = LoggerFactory.getLogger(RequestEngine.class); private SimpleCatalog catalog; private EngineOptions options; diff --git a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/SparkPlanner.scala b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/SparkPlanner.scala index 1ab350c42f8..0cf2470dcf8 100644 --- a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/SparkPlanner.scala +++ b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/SparkPlanner.scala @@ -393,7 +393,7 
@@ class SparkPlanner(session: SparkSession, config: OpenmldbBatchConfig, sparkAppN case e: Exception => println("Get exception: " + e.getMessage) e.printStackTrace() - body(sqlEngine) + throw e } finally { if (sqlEngine != null) { sqlEngine.close() diff --git a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/api/OpenmldbSession.scala b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/api/OpenmldbSession.scala index 7059f0146bd..d16496a6111 100755 --- a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/api/OpenmldbSession.scala +++ b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/api/OpenmldbSession.scala @@ -18,7 +18,7 @@ package com._4paradigm.openmldb.batch.api import com._4paradigm.openmldb.batch.catalog.OpenmldbCatalogService import com._4paradigm.openmldb.batch.utils.{DataTypeUtil, VersionCli} -import com._4paradigm.openmldb.batch.utils.HybridseUtil.autoLoad +import com._4paradigm.openmldb.batch.utils.DataSourceUtil.autoLoad import com._4paradigm.openmldb.batch.{OpenmldbBatchConfig, SparkPlanner} import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.log4j.{Level, Logger} diff --git a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/CreateTablePlan.scala b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/CreateTablePlan.scala index 17f325fb909..579195fa257 100644 --- a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/CreateTablePlan.scala +++ b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/CreateTablePlan.scala @@ -18,7 +18,7 @@ package com._4paradigm.openmldb.batch.nodes import com._4paradigm.hybridse.node.CreateTableLikeClause.LikeKind import com._4paradigm.hybridse.sdk.UnsupportedHybridSeException import com._4paradigm.hybridse.vm.PhysicalCreateTableNode -import com._4paradigm.openmldb.batch.utils.{HybridseUtil, OpenmldbTableUtil} +import com._4paradigm.openmldb.batch.utils.{DataSourceUtil, OpenmldbTableUtil} import com._4paradigm.openmldb.batch.{PlanContext, SparkInstance} import org.slf4j.LoggerFactory @@ -44,7 +44,7 @@ object CreateTablePlan { val df = likeKind match { case LikeKind.HIVE => val hivePath = node.getData_.GetLikePath() - HybridseUtil.autoLoad(ctx.getOpenmldbSession, hivePath, "hive", Map[String, String](), null) + DataSourceUtil.autoLoad(ctx.getOpenmldbSession, hivePath, "hive", Map[String, String](), null) case LikeKind.PARQUET => val parquetPath = node.getData_.GetLikePath() ctx.getSparkSession.read.parquet(parquetPath) diff --git a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/LoadDataPlan.scala b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/LoadDataPlan.scala index a04b46ab650..7f87c55ffce 100644 --- a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/LoadDataPlan.scala +++ b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/LoadDataPlan.scala @@ -16,7 +16,7 @@ package com._4paradigm.openmldb.batch.nodes import com._4paradigm.hybridse.vm.PhysicalLoadDataNode -import com._4paradigm.openmldb.batch.utils.HybridseUtil +import com._4paradigm.openmldb.batch.utils.{DataSourceUtil, HybridseUtil} import com._4paradigm.openmldb.batch.{PlanContext, SparkInstance} import com._4paradigm.openmldb.proto.NS.OfflineTableInfo import org.slf4j.LoggerFactory @@ -51,20 +51,23 @@ object LoadDataPlan { // we read input file even in soft copy, // cause we want to check if "the input file schema == openmldb table 
schema" - val df = HybridseUtil.autoLoad(ctx.getOpenmldbSession, inputFile, format, options, info.getColumnDescList, + val df = DataSourceUtil.autoLoad(ctx.getOpenmldbSession, inputFile, format, options, info.getColumnDescList, loadDataSql) // write - logger.info("write data to storage {}, writer[mode {}], is deep? {}", storage, mode, deepCopy.toString) + logger.info("write data to storage {}, writer mode {}, is deep {}", storage, mode, deepCopy.toString) if (storage == "online") { // Import online data require(deepCopy && mode == "append", "import to online storage, can't do soft copy, and mode must be append") val writeType = extra.get("writer_type").get + val putIfAbsent = extra.get("put_if_absent").get.toBoolean + logger.info(s"online write type ${writeType}, put if absent ${putIfAbsent}") val writeOptions = Map( "db" -> db, "table" -> table, "zkCluster" -> ctx.getConf.openmldbZkCluster, "zkPath" -> ctx.getConf.openmldbZkRootPath, - "writerType" -> writeType + "writerType" -> writeType, + "putIfAbsent" -> putIfAbsent.toString ) df.write.options(writeOptions).format("openmldb").mode(mode).save() } else { // Import offline data diff --git a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/SelectIntoPlan.scala b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/SelectIntoPlan.scala index 7dcdd51575b..4f366774cd5 100644 --- a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/SelectIntoPlan.scala +++ b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/SelectIntoPlan.scala @@ -16,7 +16,7 @@ package com._4paradigm.openmldb.batch.nodes import com._4paradigm.hybridse.vm.PhysicalSelectIntoNode -import com._4paradigm.openmldb.batch.utils.{HybridseUtil, OpenmldbTableUtil} +import com._4paradigm.openmldb.batch.utils.{HybridseUtil, OpenmldbTableUtil, DataSourceUtil} import com._4paradigm.openmldb.batch.{PlanContext, SparkInstance} import org.slf4j.LoggerFactory @@ -39,12 +39,12 @@ object SelectIntoPlan { throw new Exception("select empty, skip save") } - if (format == "hive") { + if (DataSourceUtil.isCatalog(format)) { // we won't check if the database exists, if not, save will throw exception - // DO NOT create database in here(the table location will be spark warehouse) - val dbt = HybridseUtil.hiveDest(outPath) - logger.info(s"offline select into: hive way, write mode[${mode}], out table ${dbt}") - input.getDf().write.format("hive").mode(mode).saveAsTable(dbt) + // Hive: DO NOT create database in here(the table location will be spark warehouse) + val dbt = DataSourceUtil.catalogDest(outPath) + logger.info(s"offline select into: $format catalog, write mode[${mode}], out table ${dbt}") + input.getDf().write.format(format).mode(mode).saveAsTable(dbt) } else if (format == "openmldb") { val (db, table) = HybridseUtil.getOpenmldbDbAndTable(outPath) diff --git a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/utils/DataSourceUtil.scala b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/utils/DataSourceUtil.scala new file mode 100644 index 00000000000..12de283497c --- /dev/null +++ b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/utils/DataSourceUtil.scala @@ -0,0 +1,228 @@ +/* + * Copyright 2021 4Paradigm + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com._4paradigm.openmldb.batch.utils + +import java.util +import com._4paradigm.hybridse.`type`.TypeOuterClass.{ColumnDef, Database, TableDef} +import com._4paradigm.hybridse.node.ConstNode +import com._4paradigm.hybridse.sdk.UnsupportedHybridSeException +import com._4paradigm.hybridse.vm.{PhysicalLoadDataNode, PhysicalOpNode, PhysicalSelectIntoNode} +import com._4paradigm.openmldb.batch.api.OpenmldbSession +import com._4paradigm.openmldb.proto +import com._4paradigm.openmldb.proto.Common +import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.functions.{col, first} +import org.apache.spark.sql.types.{BooleanType, DataType, DateType, DoubleType, FloatType, IntegerType, LongType, + ShortType, StringType, StructField, StructType, TimestampType} +import org.apache.spark.sql.{DataFrame, DataFrameReader, Row, SparkSession} +import org.slf4j.LoggerFactory + +import scala.collection.JavaConverters.asScalaBufferConverter +import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable` +import scala.collection.mutable + +// util for any data source (defined by format & options) +object DataSourceUtil { + private val logger = LoggerFactory.getLogger(this.getClass) + + def autoLoad(openmldbSession: OpenmldbSession, file: String, format: String, options: Map[String, String], + columns: util.List[Common.ColumnDesc]): DataFrame = { + autoLoad(openmldbSession, file, List.empty[String], format, options, columns, "") + } + + def autoLoad(openmldbSession: OpenmldbSession, file: String, format: String, options: Map[String, String], + columns: util.List[Common.ColumnDesc], loadDataSql: String): DataFrame = { + autoLoad(openmldbSession, file, List.empty[String], format, options, columns, loadDataSql) + } + + // otherwise isCatalog + // hdfs files are csv or parquet + def isFile(format: String): Boolean = { + format.toLowerCase.equals("csv") || format.toLowerCase.equals("parquet") + } + + def isCatalog(format: String): Boolean = { + !isFile(format) + } + + private def checkSchemaIgnoreNullable(actual: StructType, expect: StructType): Boolean = { + actual.zip(expect).forall{case (a, b) => (a.name, a.dataType) == (b.name, b.dataType)} + } + + // Load df from file **and** symbol paths, they should be in the same format and options. + // Decide which load method to use by arg `format`, DO NOT pass `hive://a.b` with format `csv`, + // the format should be `hive`. + // Use `parseOptions` in LoadData/SelectInto to get the right format (filePath & option `format`). + // valid pattern: + // 1. catalog: discard other options, format supports hive(just schema.table), + // custom catalog(.schema.table, e.g. iceberg) + // 2. file: local file or hdfs file, format supports csv & parquet, other options take effect + // We use OpenmldbSession for running sparksql in catalogLoad. If in 4pd Spark distribution, SparkSession.sql + // will do openmldbSql first, and if DISABLE_OPENMLDB_FALLBACK, we can't use sparksql. 
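// [Editorial sketch, not part of the diff] A minimal illustration of the two valid patterns described
// in the comment above, assuming an OpenmldbSession `sess` and a column list `cols` are already in scope;
// the paths and table names below are hypothetical.
//   val catalogDf = DataSourceUtil.autoLoad(sess, "hive://demo_db.demo_table", "hive", Map(), cols)
//   val fileDf    = DataSourceUtil.autoLoad(sess, "file:///tmp/demo.csv", "csv", Map("header" -> "true"), cols)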
+ def autoLoad(openmldbSession: OpenmldbSession, file: String, symbolPaths: List[String], format: String, + options: Map[String, String], columns: util.List[Common.ColumnDesc], loadDataSql: String = "") + : DataFrame = { + val fmt = format.toLowerCase + if (isCatalog(fmt)) { + logger.info(s"load data from catalog table, format $fmt, paths: $file $symbolPaths") + if (file.isEmpty) { + // no file, read all symbol paths + var outputDf: DataFrame = null + symbolPaths.zipWithIndex.foreach { case (path, index) => + if (index == 0) { + outputDf = catalogLoad(openmldbSession, path, columns, loadDataSql) + } else { + outputDf = outputDf.union(catalogLoad(openmldbSession, path, columns, loadDataSql)) + } + } + outputDf + } else { + var outputDf = catalogLoad(openmldbSession, file, columns, loadDataSql) + for (path: String <- symbolPaths) { + outputDf = outputDf.union(catalogLoad(openmldbSession, path, columns, loadDataSql)) + } + outputDf + } + } else { + logger.info("load data from file {} & {} reader[format {}, options {}]", file, symbolPaths, fmt, options) + + if (file.isEmpty) { + var outputDf: DataFrame = null + symbolPaths.zipWithIndex.foreach { case (path, index) => + if (index == 0) { + outputDf = autoFileLoad(openmldbSession, path, fmt, options, columns, loadDataSql) + } else { + outputDf = outputDf.union(autoFileLoad(openmldbSession, path, fmt, options, columns, + loadDataSql)) + } + } + outputDf + } else { + var outputDf = autoFileLoad(openmldbSession, file, fmt, options, columns, loadDataSql) + for (path: String <- symbolPaths) { + outputDf = outputDf.union(autoFileLoad(openmldbSession, path, fmt, options, columns, + loadDataSql)) + } + outputDf + } + } + } + + // We want df with oriSchema, but if the file format is csv: + // 1. we support two format of timestamp + // 2. spark read may change the df schema to all nullable + // So we should fix it. + private def autoFileLoad(openmldbSession: OpenmldbSession, file: String, format: String, + options: Map[String, String], columns: util.List[Common.ColumnDesc], loadDataSql: String): DataFrame = { + require(format.equals("csv") || format.equals("parquet"), s"unsupported format $format") + val reader = openmldbSession.getSparkSession.read.options(options) + + val (oriSchema, readSchema, tsCols) = HybridseUtil.extractOriginAndReadSchema(columns) + var df = if (format.equals("parquet")) { + // When reading Parquet files, all columns are automatically converted to be nullable for compatibility reasons. 
+ // ref https://spark.apache.org/docs/3.2.1/sql-data-sources-parquet.html + val df = if (loadDataSql != null && loadDataSql.nonEmpty) { + reader.format(format).load(file).createOrReplaceTempView("file") + openmldbSession.sparksql(loadDataSql) + } else { + reader.format(format).load(file) + } + + require(checkSchemaIgnoreNullable(df.schema, oriSchema), + s"schema mismatch(ignore nullable), loaded ${df.schema}!= table $oriSchema, check $file") + // reset nullable property + df.sqlContext.createDataFrame(df.rdd, oriSchema) + } else { + // csv should auto detect the timestamp format + reader.format(format) + // use string to read, then infer the format by the first non-null value of the ts column + val longTsCols = HybridseUtil.parseLongTsCols(reader, readSchema, tsCols, file) + logger.info(s"read schema: $readSchema, file $file") + var df = reader.schema(readSchema).load(file) + if (longTsCols.nonEmpty) { + // convert long type to timestamp type + for (tsCol <- longTsCols) { + logger.debug(s"cast $tsCol to timestamp") + df = df.withColumn(tsCol, (col(tsCol) / 1000).cast("timestamp")) + } + } + + if (loadDataSql != null && loadDataSql.nonEmpty) { + df.createOrReplaceTempView("file") + df = openmldbSession.sparksql(loadDataSql) + } + + if (logger.isDebugEnabled()) { + logger.debug(s"read dataframe schema: ${df.schema}, count: ${df.count()}") + df.show(10) + } + + // if we read non-streaming files, the df schema fields will be set as all nullable. + // so we need to set it right + if (!df.schema.equals(oriSchema)) { + logger.info(s"df schema: ${df.schema}, reset schema") + df.sqlContext.createDataFrame(df.rdd, oriSchema) + } else{ + df + } + } + + require(df.schema == oriSchema, s"schema mismatch, loaded ${df.schema} != table $oriSchema, check $file") + df + } + + // path can have prefix or not, we should remove it if exists + def catalogDest(path: String): String = { + path.split("://").last + } + + private def catalogLoad(openmldbSession: OpenmldbSession, file: String, columns: util.List[Common.ColumnDesc], + loadDataSql: String = ""): DataFrame = { + if (logger.isDebugEnabled()) { + logger.debug("session catalog {}", openmldbSession.getSparkSession.sessionState.catalog) + openmldbSession.sparksql("show tables").show() + } + // use sparksql to read catalog, no need to try openmldbsql and then fallback to sparksql + val df = if (loadDataSql != null && loadDataSql.nonEmpty) { + logger.debug("Try to execute custom SQL for catalog: " + loadDataSql) + openmldbSession.sparksql(loadDataSql) + } else { + openmldbSession.sparksql(s"SELECT * FROM ${catalogDest(file)}") + } + if (logger.isDebugEnabled()) { + logger.debug(s"read dataframe schema: ${df.schema}, count: ${df.count()}") + df.show(10) + } + + if (columns != null) { + val (oriSchema, readSchema, tsCols) = HybridseUtil.extractOriginAndReadSchema(columns) + + require(checkSchemaIgnoreNullable(df.schema, oriSchema), //df.schema == oriSchema, hive table always nullable? 
+ s"schema mismatch(ignore nullable), loaded hive ${df.schema}!= table $oriSchema, check $file") + + if (!df.schema.equals(oriSchema)) { + logger.info(s"df schema: ${df.schema}, reset schema") + df.sqlContext.createDataFrame(df.rdd, oriSchema) + } else{ + df + } + } else { + df + } + + } +} diff --git a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/utils/HybridseUtil.scala b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/utils/HybridseUtil.scala index 8bf6897d82f..ee588ab677d 100644 --- a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/utils/HybridseUtil.scala +++ b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/utils/HybridseUtil.scala @@ -206,15 +206,19 @@ object HybridseUtil { } // 'file' may change the option 'format': - // If file starts with 'hive', format is hive, not the detail format in hive - // If file starts with 'file'/'hdfs', format is the file format - // result: format, options(spark write/read options), mode is common, if more options, set them to extra map + // If file starts with 'hive'/'iceberg', format is hive/iceberg, not the detail format in hive + // If file starts with 'openmldb', format is openmldb, not the detail format in openmldb + // Others, format is the origin format option + // **Result**: format, options(spark write/read options), mode is common, if more options, set them to extra map def parseOptions[T](file: String, node: T): (String, Map[String, String], String, Map[String, String]) = { // load data: read format, select into: write format + // parse hive/iceberg to avoid user forget to set format val format = if (file.toLowerCase().startsWith("hive://")) { "hive" + } else if (file.toLowerCase().startsWith("iceberg://")) { + "iceberg" } else if (file.toLowerCase().startsWith("openmldb://")) { - "openmldb" + "openmldb" // TODO(hw): no doc for it } else { parseOption(getOptionFromNode(node, "format"), "csv", getStringOrDefault).toLowerCase } @@ -247,16 +251,17 @@ object HybridseUtil { } // extra options for some special case - // only for PhysicalLoadDataNode var extraOptions: mutable.Map[String, String] = mutable.Map() + // only for PhysicalLoadDataNode extraOptions += ("deep_copy" -> parseOption(getOptionFromNode(node, "deep_copy"), "true", getBoolOrDefault)) - - // only for select into, "" means N/A - extraOptions += ("coalesce" -> parseOption(getOptionFromNode(node, "coalesce"), "0", getIntOrDefault)) - extraOptions += ("sql" -> parseOption(getOptionFromNode(node, "sql"), "", getStringOrDefault)) extraOptions += ("writer_type") -> parseOption(getOptionFromNode(node, "writer_type"), "single", getStringOrDefault) + extraOptions += ("sql" -> parseOption(getOptionFromNode(node, "sql"), "", getStringOrDefault)) + extraOptions += ("put_if_absent" -> parseOption(getOptionFromNode(node, "put_if_absent"), "false", + getBoolOrDefault)) + // only for select into, "" means N/A + extraOptions += ("coalesce" -> parseOption(getOptionFromNode(node, "coalesce"), "0", getIntOrDefault)) extraOptions += ("create_if_not_exists" -> parseOption(getOptionFromNode(node, "create_if_not_exists"), "true", getBoolOrDefault)) @@ -316,193 +321,16 @@ object HybridseUtil { longTsCols.toList } - def checkSchemaIgnoreNullable(actual: StructType, expect: StructType): Boolean = { - actual.zip(expect).forall{case (a, b) => (a.name, a.dataType) == (b.name, b.dataType)} - } - - def autoLoad(openmldbSession: OpenmldbSession, file: String, format: String, options: Map[String, String], - columns: 
util.List[Common.ColumnDesc]): DataFrame = { - autoLoad(openmldbSession, file, List.empty[String], format, options, columns, "") - } - - def autoLoad(openmldbSession: OpenmldbSession, file: String, format: String, options: Map[String, String], - columns: util.List[Common.ColumnDesc], loadDataSql: String): DataFrame = { - autoLoad(openmldbSession, file, List.empty[String], format, options, columns, loadDataSql) - } - - // Load df from file **and** symbol paths, they should in the same format and options. - // Decide which load method to use by arg `format`, DO NOT pass `hive://a.b` with format `csv`, - // the format should be `hive`. - // Use `parseOptions` in LoadData/SelectInto to get the right format(filePath & option `format`). - // valid pattern: - // 1. hive path, format must be hive, discard other options - // 2. file/hdfs path, format supports csv & parquet, other options take effect - // We use OpenmldbSession for running sparksql in hiveLoad. If in 4pd Spark distribution, SparkSession.sql - // will do openmldbSql first, and if DISABLE_OPENMLDB_FALLBACK, we can't use sparksql. - def autoLoad(openmldbSession: OpenmldbSession, file: String, symbolPaths: List[String], format: String, - options: Map[String, String], columns: util.List[Common.ColumnDesc], loadDataSql: String = "") - : DataFrame = { - val fmt = format.toLowerCase - if (fmt.equals("hive")) { - logger.info(s"load data from hive table $file & $symbolPaths") - if (file.isEmpty) { - var outputDf: DataFrame = null - symbolPaths.zipWithIndex.foreach { case (path, index) => - if (index == 0) { - outputDf = HybridseUtil.hiveLoad(openmldbSession, path, columns, loadDataSql) - } else { - outputDf = outputDf.union(HybridseUtil.hiveLoad(openmldbSession, path, columns, loadDataSql)) - } - } - outputDf - } else { - var outputDf = HybridseUtil.hiveLoad(openmldbSession, file, columns, loadDataSql) - for (path: String <- symbolPaths) { - outputDf = outputDf.union(HybridseUtil.hiveLoad(openmldbSession, path, columns, loadDataSql)) - } - outputDf - } - } else { - logger.info("load data from file {} & {} reader[format {}, options {}]", file, symbolPaths, fmt, options) - - if (file.isEmpty) { - var outputDf: DataFrame = null - symbolPaths.zipWithIndex.foreach { case (path, index) => - if (index == 0) { - outputDf = HybridseUtil.autoFileLoad(openmldbSession, path, fmt, options, columns, loadDataSql) - } else { - outputDf = outputDf.union(HybridseUtil.autoFileLoad(openmldbSession, path, fmt, options, columns, - loadDataSql)) - } - } - outputDf - } else { - var outputDf = HybridseUtil.autoFileLoad(openmldbSession, file, fmt, options, columns, loadDataSql) - for (path: String <- symbolPaths) { - outputDf = outputDf.union(HybridseUtil.autoFileLoad(openmldbSession, path, fmt, options, columns, - loadDataSql)) - } - outputDf - } - } - } - - // We want df with oriSchema, but if the file format is csv: - // 1. we support two format of timestamp - // 2. spark read may change the df schema to all nullable - // So we should fix it. 
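// [Editorial sketch, not part of the diff] The "fix it" step mentioned in the comment above, in one line:
// Spark may mark every loaded column as nullable, so the DataFrame is rebuilt against the expected schema
// when they differ (assuming `df` and `oriSchema` are in scope, as inside autoFileLoad):
//   val fixed = if (df.schema == oriSchema) df else df.sqlContext.createDataFrame(df.rdd, oriSchema)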
- private def autoFileLoad(openmldbSession: OpenmldbSession, file: String, format: String, - options: Map[String, String], columns: util.List[Common.ColumnDesc], loadDataSql: String): DataFrame = { - require(format.equals("csv") || format.equals("parquet"), s"unsupported format $format") - val reader = openmldbSession.getSparkSession.read.options(options) - - val (oriSchema, readSchema, tsCols) = HybridseUtil.extractOriginAndReadSchema(columns) - var df = if (format.equals("parquet")) { - // When reading Parquet files, all columns are automatically converted to be nullable for compatibility reasons. - // ref https://spark.apache.org/docs/3.2.1/sql-data-sources-parquet.html - val df = if (loadDataSql != null && loadDataSql.nonEmpty) { - reader.format(format).load(file).createOrReplaceTempView("file") - openmldbSession.sparksql(loadDataSql) - } else { - reader.format(format).load(file) - } - - require(checkSchemaIgnoreNullable(df.schema, oriSchema), - s"schema mismatch(ignore nullable), loaded ${df.schema}!= table $oriSchema, check $file") - // reset nullable property - df.sqlContext.createDataFrame(df.rdd, oriSchema) - } else { - // csv should auto detect the timestamp format - reader.format(format) - // use string to read, then infer the format by the first non-null value of the ts column - val longTsCols = HybridseUtil.parseLongTsCols(reader, readSchema, tsCols, file) - logger.info(s"read schema: $readSchema, file $file") - var df = reader.schema(readSchema).load(file) - if (longTsCols.nonEmpty) { - // convert long type to timestamp type - for (tsCol <- longTsCols) { - logger.debug(s"cast $tsCol to timestamp") - df = df.withColumn(tsCol, (col(tsCol) / 1000).cast("timestamp")) - } - } - - if (loadDataSql != null && loadDataSql.nonEmpty) { - df.createOrReplaceTempView("file") - df = openmldbSession.sparksql(loadDataSql) - } - - if (logger.isDebugEnabled()) { - logger.debug(s"read dataframe schema: ${df.schema}, count: ${df.count()}") - df.show(10) - } - - // if we read non-streaming files, the df schema fields will be set as all nullable. 
- // so we need to set it right - if (!df.schema.equals(oriSchema)) { - logger.info(s"df schema: ${df.schema}, reset schema") - df.sqlContext.createDataFrame(df.rdd, oriSchema) - } else{ - df - } - } - - require(df.schema == oriSchema, s"schema mismatch, loaded ${df.schema} != table $oriSchema, check $file") - df - } - - def hiveDest(path: String): String = { - require(path.toLowerCase.startsWith("hive://"), s"invalid hive path $path") - // hive:// - val tableStartPos = 7 - path.substring(tableStartPos) - } - def getOpenmldbDbAndTable(path: String): (String, String) = { - require(path.toLowerCase.startsWith("openmldb://")) + require(path.toLowerCase.startsWith("openmldb://"), s"unsupported path $path") // openmldb:// val tableStartPos = 11 val dbAndTableString = path.substring(tableStartPos) - require(dbAndTableString.split("\\.").size == 2) + require(dbAndTableString.split("\\.").size == 2, s"invalid path $path") val db = dbAndTableString.split("\\.")(0) val table = dbAndTableString.split("\\.")(1) (db, table) } - - private def hiveLoad(openmldbSession: OpenmldbSession, file: String, columns: util.List[Common.ColumnDesc], - loadDataSql: String = ""): DataFrame = { - if (logger.isDebugEnabled()) { - logger.debug("session catalog {}", openmldbSession.getSparkSession.sessionState.catalog) - openmldbSession.sparksql("show tables").show() - } - // use sparksql to read hive, no need to try openmldbsql and then fallback to sparksql - val df = if (loadDataSql != null && loadDataSql.nonEmpty) { - logger.debug("Try to execute custom SQL for hive: " + loadDataSql) - openmldbSession.sparksql(loadDataSql) - } else { - openmldbSession.sparksql(s"SELECT * FROM ${hiveDest(file)}") - } - if (logger.isDebugEnabled()) { - logger.debug(s"read dataframe schema: ${df.schema}, count: ${df.count()}") - df.show(10) - } - - if (columns != null) { - val (oriSchema, readSchema, tsCols) = HybridseUtil.extractOriginAndReadSchema(columns) - - require(checkSchemaIgnoreNullable(df.schema, oriSchema), //df.schema == oriSchema, hive table always nullable? 
- s"schema mismatch(ignore nullable), loaded hive ${df.schema}!= table $oriSchema, check $file") - - if (!df.schema.equals(oriSchema)) { - logger.info(s"df schema: ${df.schema}, reset schema") - df.sqlContext.createDataFrame(df.rdd, oriSchema) - } else{ - df - } - } else { - df - } - - } } diff --git a/java/openmldb-batch/src/test/scala/com/_4paradigm/openmldb/batch/TestLoadDataPlan.scala b/java/openmldb-batch/src/test/scala/com/_4paradigm/openmldb/batch/TestLoadDataPlan.scala index ee8e3e2633f..7ae1d9914f2 100644 --- a/java/openmldb-batch/src/test/scala/com/_4paradigm/openmldb/batch/TestLoadDataPlan.scala +++ b/java/openmldb-batch/src/test/scala/com/_4paradigm/openmldb/batch/TestLoadDataPlan.scala @@ -162,8 +162,8 @@ class TestLoadDataPlan extends SparkTestSuite with Matchers { fail("unreachable") } - println("deep load data with invalid format option") - a[IllegalArgumentException] should be thrownBy { + println("deep load data with invalid format option, catalog will throw exception") + a[org.apache.spark.sql.catalyst.parser.ParseException] should be thrownBy { openmldbSession.openmldbSql(s"load data infile '$testFileWithHeader' into table $db.$table " + "options(format='txt', mode='overwrite');") fail("unreachable") diff --git a/java/openmldb-batch/src/test/scala/com/_4paradigm/openmldb/batch/utils/HybridseUtilTest.scala b/java/openmldb-batch/src/test/scala/com/_4paradigm/openmldb/batch/utils/DataSourceUtilTest.scala similarity index 97% rename from java/openmldb-batch/src/test/scala/com/_4paradigm/openmldb/batch/utils/HybridseUtilTest.scala rename to java/openmldb-batch/src/test/scala/com/_4paradigm/openmldb/batch/utils/DataSourceUtilTest.scala index 5299ae5ae25..726173f9eb1 100644 --- a/java/openmldb-batch/src/test/scala/com/_4paradigm/openmldb/batch/utils/HybridseUtilTest.scala +++ b/java/openmldb-batch/src/test/scala/com/_4paradigm/openmldb/batch/utils/DataSourceUtilTest.scala @@ -19,7 +19,7 @@ package com._4paradigm.openmldb.batch.utils import com._4paradigm.openmldb.batch.PlanContext import com._4paradigm.openmldb.batch.SparkTestSuite import com._4paradigm.openmldb.batch.api.OpenmldbSession -import com._4paradigm.openmldb.batch.utils.HybridseUtil.autoLoad +import com._4paradigm.openmldb.batch.utils.DataSourceUtil.autoLoad import com._4paradigm.openmldb.proto.{Common, Type} import org.apache.spark.sql.DataFrame import org.apache.spark.SparkConf @@ -27,7 +27,7 @@ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.AnalysisException import org.scalatest.Matchers -class HybridseUtilTest extends SparkTestSuite with Matchers { +class DataSourceUtilTest extends SparkTestSuite with Matchers { var openmldbSession: OpenmldbSession = _ override def customizedBefore(): Unit = { diff --git a/java/openmldb-batchjob/pom.xml b/java/openmldb-batchjob/pom.xml index 8f7c52efa6f..376c708f644 100644 --- a/java/openmldb-batchjob/pom.xml +++ b/java/openmldb-batchjob/pom.xml @@ -16,8 +16,8 @@ UTF-8 - 1.7 - 1.7 + 1.8 + 1.8 provided diff --git a/java/openmldb-common/src/main/java/com/_4paradigm/openmldb/common/codec/ClassicRowBuilder.java b/java/openmldb-common/src/main/java/com/_4paradigm/openmldb/common/codec/ClassicRowBuilder.java index 0b5048eff74..6f624e87351 100644 --- a/java/openmldb-common/src/main/java/com/_4paradigm/openmldb/common/codec/ClassicRowBuilder.java +++ b/java/openmldb-common/src/main/java/com/_4paradigm/openmldb/common/codec/ClassicRowBuilder.java @@ -17,8 +17,6 @@ import com._4paradigm.openmldb.proto.Type.DataType; import 
com._4paradigm.openmldb.proto.Common.ColumnDesc; -import org.joda.time.DateTime; - import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.sql.Date; diff --git a/java/openmldb-common/src/main/java/com/_4paradigm/openmldb/common/codec/FlexibleRowBuilder.java b/java/openmldb-common/src/main/java/com/_4paradigm/openmldb/common/codec/FlexibleRowBuilder.java index f11f2a31a64..7a3fa9f686c 100644 --- a/java/openmldb-common/src/main/java/com/_4paradigm/openmldb/common/codec/FlexibleRowBuilder.java +++ b/java/openmldb-common/src/main/java/com/_4paradigm/openmldb/common/codec/FlexibleRowBuilder.java @@ -390,7 +390,7 @@ public boolean build() { result.putInt(totalSize); // Size result.put(nullBitmap.getBuffer()); // BitMap result.put(baseFieldBuf.array()); // Base data type - result.put(strAddrBuf.array()); // String addr + result.put(strAddrBuf.array(), 0, strAddrLen); // String addr result.put(stringWriter.toByteArray()); // String value return true; } diff --git a/java/openmldb-common/src/test/java/com/_4paradigm/openmldb/common/RowCodecTest.java b/java/openmldb-common/src/test/java/com/_4paradigm/openmldb/common/RowCodecTest.java index c562cbd70ee..f4d9ab86757 100644 --- a/java/openmldb-common/src/test/java/com/_4paradigm/openmldb/common/RowCodecTest.java +++ b/java/openmldb-common/src/test/java/com/_4paradigm/openmldb/common/RowCodecTest.java @@ -1018,5 +1018,41 @@ public void testSpecialCase() { Assert.fail(); } } + + @Test + public void testLongString() { + try { + List schema = new ArrayList(); + schema.add(ColumnDesc.newBuilder().setName("col1").setDataType(DataType.kInt).build()); + schema.add(ColumnDesc.newBuilder().setName("col2").setDataType(DataType.kString).build()); + schema.add(ColumnDesc.newBuilder().setName("col3").setDataType(DataType.kInt).build()); + schema.add(ColumnDesc.newBuilder().setName("col4").setDataType(DataType.kString).build()); + schema.add(ColumnDesc.newBuilder().setName("col5").setDataType(DataType.kString).build()); + RowBuilder builder = new FlexibleRowBuilder(schema); + for (int i = 0; i < 20; i++) { + String str1 = genRandomString(100); + String str2 = i % 2 == 0 ? genRandomString(255) : genRandomString(20); + String str3 = i % 2 == 0 ? 
genRandomString(1000) : genRandomString(10); + Assert.assertTrue(builder.appendInt(1)); + Assert.assertTrue(builder.appendString(str1)); + Assert.assertTrue(builder.appendInt(10)); + Assert.assertTrue(builder.appendString(str2)); + Assert.assertTrue(builder.appendString(str3)); + builder.build(); + + ByteBuffer buffer = builder.getValue(); + RowView rowView = new RowView(schema, buffer, buffer.capacity()); + Assert.assertEquals(rowView.getInt(0), new Integer(1)); + Assert.assertEquals(rowView.getString(1), str1); + Assert.assertEquals(rowView.getString(3), str2); + Assert.assertEquals(rowView.getString(4), str3); + + ((FlexibleRowBuilder)builder).clear(); + } + } catch (Exception e) { + e.printStackTrace(); + Assert.assertTrue(false); + } + } } diff --git a/java/openmldb-import/src/main/java/com/_4paradigm/openmldb/importer/Importer.java b/java/openmldb-import/src/main/java/com/_4paradigm/openmldb/importer/Importer.java index b020f52dff4..29bc188e8d3 100644 --- a/java/openmldb-import/src/main/java/com/_4paradigm/openmldb/importer/Importer.java +++ b/java/openmldb-import/src/main/java/com/_4paradigm/openmldb/importer/Importer.java @@ -88,6 +88,12 @@ enum Mode { @CommandLine.Option(names = "--rpc_read_timeout", description = "rpc read timeout(ms)", defaultValue = "50000") private int rpcReadTimeout; + @CommandLine.Option(names = "--user", description = "the user to connect OpenMLDB", defaultValue = "root") + private String user; + + @CommandLine.Option(names = "--password", description = "the password", defaultValue = "") + private String password; + FilesReader reader = null; SqlExecutor router = null; @@ -108,6 +114,8 @@ public boolean setUpSDK() { SdkOption option = new SdkOption(); option.setZkCluster(zkCluster); option.setZkPath(zkRootPath); + option.setUser(user); + option.setPassword(password); try { router = new SqlClusterExecutor(option); return true; diff --git a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/jdbc/SQLConnection.java b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/jdbc/SQLConnection.java index 5383eaf246d..8259682755d 100644 --- a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/jdbc/SQLConnection.java +++ b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/jdbc/SQLConnection.java @@ -82,7 +82,8 @@ public java.sql.Statement createStatement() throws SQLException { @Override public java.sql.PreparedStatement prepareStatement(String sql) throws SQLException { String lower = sql.toLowerCase(); - if (lower.startsWith("insert into")) { + // insert, insert or xxx + if (lower.startsWith("insert ")) { return client.getInsertPreparedStmt(this.defaultDatabase, sql); } else if (lower.startsWith("select")) { return client.getPreparedStatement(this.defaultDatabase, sql); diff --git a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/jdbc/SQLDriver.java b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/jdbc/SQLDriver.java index e7edffb35de..40b8e7c3fdc 100644 --- a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/jdbc/SQLDriver.java +++ b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/jdbc/SQLDriver.java @@ -162,6 +162,10 @@ private SdkOption createOptionByProps(Properties properties) { if (prop != null) { option.setZkLogFile(prop); } + prop = properties.getProperty("zkCert"); + if (prop != null) { + option.setZkCert(prop); + } prop = properties.getProperty("glogLevel"); if (prop != null) { option.setGlogLevel(Integer.parseInt(prop)); @@ -174,6 +178,14 @@ private SdkOption createOptionByProps(Properties 
properties) { if (prop != null) { option.setMaxSqlCacheSize(Integer.parseInt(prop)); } + prop = properties.getProperty("user"); + if (prop != null) { + option.setUser(prop); + } + prop = properties.getProperty("password"); + if (prop != null) { + option.setPassword(prop); + } return option; } diff --git a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/SdkOption.java b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/SdkOption.java index eca5289bf32..98279ac267f 100644 --- a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/SdkOption.java +++ b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/SdkOption.java @@ -17,13 +17,14 @@ package com._4paradigm.openmldb.sdk; import lombok.Data; +import java.io.Serializable; import com._4paradigm.openmldb.BasicRouterOptions; import com._4paradigm.openmldb.SQLRouterOptions; import com._4paradigm.openmldb.StandaloneOptions; @Data -public class SdkOption { +public class SdkOption implements Serializable { // TODO(hw): set isClusterMode automatically private boolean isClusterMode = true; // options for cluster mode @@ -46,6 +47,8 @@ public class SdkOption { private String glogDir = ""; private int maxSqlCacheSize = 50; private boolean isLight = false; + private String user = "root"; + private String password = ""; private void buildBaseOptions(BasicRouterOptions opt) { opt.setEnable_debug(getEnableDebug()); @@ -53,6 +56,10 @@ private void buildBaseOptions(BasicRouterOptions opt) { opt.setGlog_level(getGlogLevel()); opt.setGlog_dir(getGlogDir()); opt.setMax_sql_cache_size(getMaxSqlCacheSize()); + opt.setUser(getUser()); + if (!getPassword().isEmpty()) { + opt.setPassword(getPassword()); + } } public SQLRouterOptions buildSQLRouterOptions() throws SqlException { diff --git a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementImpl.java b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementImpl.java index 6acefe8acff..ecc39b467c1 100644 --- a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementImpl.java +++ b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementImpl.java @@ -319,7 +319,7 @@ public boolean execute() throws SQLException { // actually only one row boolean ok = router.ExecuteInsert(cache.getDatabase(), cache.getName(), cache.getTid(), cache.getPartitionNum(), - dimensions.array(), dimensions.capacity(), value.array(), value.capacity(), status); + dimensions.array(), dimensions.capacity(), value.array(), value.capacity(), cache.isPutIfAbsent(), status); // cleanup rows even if insert failed // we can't execute() again without set new row, so we must clean up here clearParameters(); @@ -381,7 +381,7 @@ public int[] executeBatch() throws SQLException { boolean ok = router.ExecuteInsert(cache.getDatabase(), cache.getName(), cache.getTid(), cache.getPartitionNum(), pair.getKey().array(), pair.getKey().capacity(), - pair.getValue().array(), pair.getValue().capacity(), status); + pair.getValue().array(), pair.getValue().capacity(), cache.isPutIfAbsent(), status); if (!ok) { // TODO(hw): may lost log, e.g. openmldb-batch online import in yarn mode? 
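// [Editorial sketch, not part of the diff] How the new user/password settings might be wired through
// SdkOption; setUser/setPassword correspond to the fields added above, while the ZooKeeper address and
// credentials here are hypothetical.
//   val option = new SdkOption()
//   option.setZkCluster("127.0.0.1:2181")
//   option.setZkPath("/openmldb")
//   option.setUser("root")      // default user when not set
//   option.setPassword("")      // empty password keeps the current default
//   val executor = new SqlClusterExecutor(option)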
logger.warn(status.ToString()); diff --git a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementMeta.java b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementMeta.java index 448438e9d31..cf2bd05cb58 100644 --- a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementMeta.java +++ b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementMeta.java @@ -31,6 +31,7 @@ public class InsertPreparedStatementMeta { private Set indexPos = new HashSet<>(); private Map> indexMap = new HashMap<>(); private Map defaultIndexValue = new HashMap<>(); + private boolean putIfAbsent; public InsertPreparedStatementMeta(String sql, NS.TableInfo tableInfo, SQLInsertRow insertRow) { this.sql = sql; @@ -51,6 +52,7 @@ public InsertPreparedStatementMeta(String sql, NS.TableInfo tableInfo, SQLInsert VectorUint32 idxArray = insertRow.GetHoleIdx(); buildHoleIdx(idxArray); idxArray.delete(); + putIfAbsent = insertRow.IsPutIfAbsent(); } private void buildIndex(NS.TableInfo tableInfo) { @@ -215,4 +217,8 @@ Map> getIndexMap() { Map getDefaultIndexValue() { return defaultIndexValue; } + + public boolean isPutIfAbsent() { + return putIfAbsent; + } } diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbConfig.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbConfig.java new file mode 100644 index 00000000000..7c0981d0a6c --- /dev/null +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbConfig.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com._4paradigm.openmldb.spark; + +import com._4paradigm.openmldb.sdk.SdkOption; + +import java.io.Serializable; + +import org.sparkproject.guava.base.Preconditions; + +// Must serializable +public class OpenmldbConfig implements Serializable { + public final static String DB = "db"; + public final static String TABLE = "table"; + public final static String ZK_CLUSTER = "zkCluster"; + public final static String ZK_PATH = "zkPath"; + + /* read&write */ + private String dbName; + private String tableName; + private SdkOption option = null; + + /* write */ + // single: insert when read one row + // batch: insert when commit(after read a whole partition) + private String writerType = "single"; + private int insertMemoryUsageLimit = 0; + private boolean putIfAbsent = false; + + public OpenmldbConfig() { + } + + public void setDB(String dbName) { + Preconditions.checkArgument(dbName != null && !dbName.isEmpty(), "db name must not be empty"); + this.dbName = dbName; + } + + public String getDB() { + return this.dbName; + } + + public void setTable(String tableName) { + Preconditions.checkArgument(tableName != null && !tableName.isEmpty(), "table name must not be empty"); + this.tableName = tableName; + } + + public String getTable() { + return this.tableName; + } + + public void setSdkOption(SdkOption option) { + this.option = option; + } + + public SdkOption getSdkOption() { + return this.option; + } + + public void setWriterType(String string) { + Preconditions.checkArgument(string.equals("single") || string.equals("batch"), + "writerType must be 'single' or 'batch'"); + this.writerType = string; + } + + public void setInsertMemoryUsageLimit(int int1) { + Preconditions.checkArgument(int1 >= 0, "insert_memory_usage_limit must be >= 0"); + this.insertMemoryUsageLimit = int1; + } + + public void setPutIfAbsent(Boolean valueOf) { + this.putIfAbsent = valueOf; + } + + public boolean isBatchWriter() { + return this.writerType.equals("batch"); + } + + public boolean putIfAbsent() { + return this.putIfAbsent; + } + + public int getInsertMemoryUsageLimit() { + return this.insertMemoryUsageLimit; + } + +} diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbSource.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbSource.java index 7e626f623ea..d4f4a48617d 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbSource.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbSource.java @@ -18,7 +18,6 @@ package com._4paradigm.openmldb.spark; import com._4paradigm.openmldb.sdk.SdkOption; -import com.google.common.base.Preconditions; import org.apache.spark.sql.connector.catalog.Table; import org.apache.spark.sql.connector.catalog.TableProvider; import org.apache.spark.sql.connector.expressions.Transform; @@ -29,32 +28,20 @@ import java.util.Map; public class OpenmldbSource implements TableProvider, DataSourceRegister { - private final String DB = "db"; - private final String TABLE = "table"; - private final String ZK_CLUSTER = "zkCluster"; - private final String ZK_PATH = "zkPath"; - private String dbName; - private String tableName; - private SdkOption option = null; - // single: insert when read one row - // batch: insert when commit(after read a whole partition) - private String writerType = "single"; - private int insertMemoryUsageLimit = 0; + private OpenmldbConfig config = new OpenmldbConfig(); @Override public StructType 
inferSchema(CaseInsensitiveStringMap options) { - Preconditions.checkNotNull(dbName = options.get(DB)); - Preconditions.checkNotNull(tableName = options.get(TABLE)); + config.setDB(options.get(OpenmldbConfig.DB)); + config.setTable(options.get(OpenmldbConfig.TABLE)); - String zkCluster = options.get(ZK_CLUSTER); - String zkPath = options.get(ZK_PATH); - Preconditions.checkNotNull(zkCluster); - Preconditions.checkNotNull(zkPath); - option = new SdkOption(); - option.setZkCluster(zkCluster); - option.setZkPath(zkPath); + SdkOption option = new SdkOption(); + option.setZkCluster(options.get(OpenmldbConfig.ZK_CLUSTER)); + option.setZkPath(options.get(OpenmldbConfig.ZK_PATH)); option.setLight(true); + config.setSdkOption(option); + String timeout = options.get("sessionTimeout"); if (timeout != null) { option.setSessionTimeout(Integer.parseInt(timeout)); @@ -63,24 +50,35 @@ public StructType inferSchema(CaseInsensitiveStringMap options) { if (timeout != null) { option.setRequestTimeout(Integer.parseInt(timeout)); } + String user = options.get("user"); + if (user != null) { + option.setUser(user); + } + String password = options.get("password"); + if (password != null) { + option.setPassword(password); + } String debug = options.get("debug"); if (debug != null) { option.setEnableDebug(Boolean.valueOf(debug)); } if (options.containsKey("writerType")) { - writerType = options.get("writerType"); + config.setWriterType(options.get("writerType")); + } + if (options.containsKey("putIfAbsent")) { + config.setPutIfAbsent(Boolean.valueOf(options.get("putIfAbsent"))); } if (options.containsKey("insert_memory_usage_limit")) { - insertMemoryUsageLimit = Integer.parseInt(options.get("insert_memory_usage_limit")); + config.setInsertMemoryUsageLimit(Integer.parseInt(options.get("insert_memory_usage_limit"))); } return null; } @Override public Table getTable(StructType schema, Transform[] partitioning, Map properties) { - return new OpenmldbTable(dbName, tableName, option, writerType, insertMemoryUsageLimit); + return new OpenmldbTable(config); } @Override diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbTable.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbTable.java index 481a9cc1f4c..e5cbcfe40ca 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbTable.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbTable.java @@ -22,10 +22,8 @@ import com._4paradigm.openmldb.sdk.SqlException; import com._4paradigm.openmldb.sdk.SqlExecutor; import com._4paradigm.openmldb.sdk.impl.SqlClusterExecutor; -import com._4paradigm.openmldb.spark.read.OpenmldbReadConfig; import com._4paradigm.openmldb.spark.read.OpenmldbScanBuilder; import com._4paradigm.openmldb.spark.write.OpenmldbWriteBuilder; -import com._4paradigm.openmldb.spark.write.OpenmldbWriteConfig; import org.apache.spark.sql.connector.catalog.SupportsRead; import org.apache.spark.sql.connector.catalog.SupportsWrite; import org.apache.spark.sql.connector.catalog.TableCapability; @@ -45,40 +43,32 @@ import java.util.Set; public class OpenmldbTable implements SupportsWrite, SupportsRead { - private final String dbName; - private final String tableName; - private final SdkOption option; - private final String writerType; - private final int insertMemoryUsageLimit; - private SqlExecutor executor = null; + private OpenmldbConfig config; + private SqlExecutor executor; private Set capabilities; - public 
OpenmldbTable(String dbName, String tableName, SdkOption option, String writerType, int insertMemoryUsageLimit) { - this.dbName = dbName; - this.tableName = tableName; - this.option = option; - this.writerType = writerType; - this.insertMemoryUsageLimit = insertMemoryUsageLimit; + public OpenmldbTable(OpenmldbConfig config) { + this.config = config; try { - this.executor = new SqlClusterExecutor(option); + this.executor = new SqlClusterExecutor(config.getSdkOption()); // no need to check table exists, schema() will check it later } catch (SqlException e) { e.printStackTrace(); + throw new RuntimeException("conn openmldb failed", e); } // TODO: cache schema & delete executor? } @Override public WriteBuilder newWriteBuilder(LogicalWriteInfo info) { - OpenmldbWriteConfig config = new OpenmldbWriteConfig(dbName, tableName, option, writerType, insertMemoryUsageLimit); return new OpenmldbWriteBuilder(config, info); } @Override public String name() { // TODO(hw): db? - return tableName; + return config.getTable(); } public static DataType sdkTypeToSparkType(int sqlType) { @@ -109,7 +99,7 @@ public static DataType sdkTypeToSparkType(int sqlType) { @Override public StructType schema() { try { - Schema schema = executor.getTableSchema(dbName, tableName); + Schema schema = executor.getTableSchema(config.getDB(), config.getTable()); List schemaList = schema.getColumnList(); StructField[] fields = new StructField[schemaList.size()]; for (int i = 0; i < schemaList.size(); i++) { @@ -136,7 +126,6 @@ public Set capabilities() { @Override public ScanBuilder newScanBuilder(CaseInsensitiveStringMap caseInsensitiveStringMap) { - OpenmldbReadConfig config = new OpenmldbReadConfig(dbName, tableName, option); return new OpenmldbScanBuilder(config); } } diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReaderFactory.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReaderFactory.java index d5e435fc247..929d30b728e 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReaderFactory.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReaderFactory.java @@ -17,15 +17,16 @@ package com._4paradigm.openmldb.spark.read; +import com._4paradigm.openmldb.spark.OpenmldbConfig; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.connector.read.InputPartition; import org.apache.spark.sql.connector.read.PartitionReader; import org.apache.spark.sql.connector.read.PartitionReaderFactory; public class OpenmldbPartitionReaderFactory implements PartitionReaderFactory { - private final OpenmldbReadConfig config; + private final OpenmldbConfig config; - public OpenmldbPartitionReaderFactory(OpenmldbReadConfig config) { + public OpenmldbPartitionReaderFactory(OpenmldbConfig config) { this.config = config; } diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbReadConfig.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbReadConfig.java deleted file mode 100644 index 91489888ba9..00000000000 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbReadConfig.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com._4paradigm.openmldb.spark.read; - -import com._4paradigm.openmldb.sdk.SdkOption; -import java.io.Serializable; - -// Must serializable -public class OpenmldbReadConfig implements Serializable { - public final String dbName, tableName, zkCluster, zkPath; - - public OpenmldbReadConfig(String dbName, String tableName, SdkOption option) { - this.dbName = dbName; - this.tableName = tableName; - this.zkCluster = option.getZkCluster(); - this.zkPath = option.getZkPath(); - // TODO(hw): other configs in SdkOption - } -} diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScan.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScan.java index fb7adb46b8e..4eeac9a6013 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScan.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScan.java @@ -17,6 +17,7 @@ package com._4paradigm.openmldb.spark.read; +import com._4paradigm.openmldb.spark.OpenmldbConfig; import org.apache.spark.sql.connector.read.Batch; import org.apache.spark.sql.connector.read.InputPartition; import org.apache.spark.sql.connector.read.PartitionReaderFactory; @@ -24,9 +25,9 @@ import org.apache.spark.sql.types.StructType; public class OpenmldbScan implements Scan, Batch { - private final OpenmldbReadConfig config; + private final OpenmldbConfig config; - public OpenmldbScan(OpenmldbReadConfig config) { + public OpenmldbScan(OpenmldbConfig config) { this.config = config; } diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScanBuilder.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScanBuilder.java index 2b500a6592e..de59a811f46 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScanBuilder.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScanBuilder.java @@ -17,13 +17,14 @@ package com._4paradigm.openmldb.spark.read; +import com._4paradigm.openmldb.spark.OpenmldbConfig; import org.apache.spark.sql.connector.read.Scan; import org.apache.spark.sql.connector.read.ScanBuilder; public class OpenmldbScanBuilder implements ScanBuilder { - private final OpenmldbReadConfig config; + private final OpenmldbConfig config; - public OpenmldbScanBuilder(OpenmldbReadConfig config) { + public OpenmldbScanBuilder(OpenmldbConfig config) { this.config = config; } diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbBatchWrite.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbBatchWrite.java index ca90a07d63a..d19fd9f6aeb 100644 --- 
a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbBatchWrite.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbBatchWrite.java @@ -17,6 +17,7 @@ package com._4paradigm.openmldb.spark.write; +import com._4paradigm.openmldb.spark.OpenmldbConfig; import org.apache.spark.sql.connector.write.BatchWrite; import org.apache.spark.sql.connector.write.DataWriterFactory; import org.apache.spark.sql.connector.write.LogicalWriteInfo; @@ -24,10 +25,10 @@ import org.apache.spark.sql.connector.write.WriterCommitMessage; public class OpenmldbBatchWrite implements BatchWrite { - private final OpenmldbWriteConfig config; + private final OpenmldbConfig config; private final LogicalWriteInfo info; - public OpenmldbBatchWrite(OpenmldbWriteConfig config, LogicalWriteInfo info) { + public OpenmldbBatchWrite(OpenmldbConfig config, LogicalWriteInfo info) { this.config = config; this.info = info; } diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataSingleWriter.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataSingleWriter.java index 843fb9a8da7..4d42c7ab0cc 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataSingleWriter.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataSingleWriter.java @@ -17,8 +17,9 @@ package com._4paradigm.openmldb.spark.write; +import com._4paradigm.openmldb.spark.OpenmldbConfig; + import com._4paradigm.openmldb.sdk.Schema; -import com._4paradigm.openmldb.sdk.SdkOption; import com._4paradigm.openmldb.sdk.SqlException; import com._4paradigm.openmldb.sdk.impl.SqlClusterExecutor; import com.google.common.base.Preconditions; @@ -27,32 +28,26 @@ import org.apache.spark.sql.connector.write.WriterCommitMessage; import java.io.IOException; -import java.sql.Date; import java.sql.PreparedStatement; import java.sql.ResultSetMetaData; import java.sql.SQLException; -import java.sql.Timestamp; -import java.sql.Types; public class OpenmldbDataSingleWriter implements DataWriter { private final int partitionId; private final long taskId; private PreparedStatement preparedStatement = null; - public OpenmldbDataSingleWriter(OpenmldbWriteConfig config, int partitionId, long taskId) { + public OpenmldbDataSingleWriter(OpenmldbConfig config, int partitionId, long taskId) { try { - SdkOption option = new SdkOption(); - option.setZkCluster(config.zkCluster); - option.setZkPath(config.zkPath); - option.setLight(true); - SqlClusterExecutor executor = new SqlClusterExecutor(option); - String dbName = config.dbName; - String tableName = config.tableName; - executor.executeSQL(dbName, "SET @@insert_memory_usage_limit=" + config.insertMemoryUsageLimit); + SqlClusterExecutor executor = new SqlClusterExecutor(config.getSdkOption()); + String dbName = config.getDB(); + String tableName = config.getTable(); + executor.executeSQL(dbName, "SET @@insert_memory_usage_limit=" + config.getInsertMemoryUsageLimit()); Schema schema = executor.getTableSchema(dbName, tableName); // create insert placeholder - StringBuilder insert = new StringBuilder("insert into " + tableName + " values(?"); + String insert_part = config.putIfAbsent()? 
"insert or ignore into " : "insert into "; + StringBuilder insert = new StringBuilder(insert_part + tableName + " values(?"); for (int i = 1; i < schema.getColumnList().size(); i++) { insert.append(",?"); } @@ -60,6 +55,7 @@ public OpenmldbDataSingleWriter(OpenmldbWriteConfig config, int partitionId, lon preparedStatement = executor.getInsertPreparedStmt(dbName, insert.toString()); } catch (SQLException | SqlException e) { e.printStackTrace(); + throw new RuntimeException("create openmldb writer failed", e); } this.partitionId = partitionId; @@ -73,32 +69,28 @@ public void write(InternalRow record) throws IOException { ResultSetMetaData metaData = preparedStatement.getMetaData(); Preconditions.checkState(record.numFields() == metaData.getColumnCount()); OpenmldbDataWriter.addRow(record, preparedStatement); - preparedStatement.execute(); + + // you can cache failed rows and throw exception when commit/close, + // but it still may interrupt other writers(pending or slow writers) + + // check return for put result + if(!preparedStatement.execute()) { + throw new IOException("execute failed"); + } } catch (Exception e) { - throw new IOException("write row to openmldb failed on " + record, e); + throw new IOException("write row to openmldb failed on " + OpenmldbDataWriter.readable(record, preparedStatement), e); } } @Override public WriterCommitMessage commit() throws IOException { - try { - preparedStatement.close(); - } catch (SQLException e) { - e.printStackTrace(); - throw new IOException("commit error", e); - } - // TODO(hw): need to return new WriterCommitMessageImpl(partitionId, taskId); ? + // no transaction, no commit return null; } @Override public void abort() throws IOException { - try { - preparedStatement.close(); - } catch (SQLException e) { - e.printStackTrace(); - throw new IOException("abort error", e); - } + // no transaction, no abort } @Override diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriter.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriter.java index e9ba0e30c5a..75dabc867d7 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriter.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriter.java @@ -17,10 +17,13 @@ package com._4paradigm.openmldb.spark.write; +import com._4paradigm.openmldb.spark.OpenmldbConfig; + import com._4paradigm.openmldb.sdk.Schema; import com._4paradigm.openmldb.sdk.SdkOption; import com._4paradigm.openmldb.sdk.SqlException; import com._4paradigm.openmldb.sdk.impl.SqlClusterExecutor; +import com._4paradigm.openmldb.spark.OpenmldbTable; import com.google.common.base.Preconditions; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.connector.write.DataWriter; @@ -39,20 +42,17 @@ public class OpenmldbDataWriter implements DataWriter { private final long taskId; private PreparedStatement preparedStatement = null; - public OpenmldbDataWriter(OpenmldbWriteConfig config, int partitionId, long taskId) { + public OpenmldbDataWriter(OpenmldbConfig config, int partitionId, long taskId) { try { - SdkOption option = new SdkOption(); - option.setZkCluster(config.zkCluster); - option.setZkPath(config.zkPath); - option.setLight(true); - SqlClusterExecutor executor = new SqlClusterExecutor(option); - String dbName = config.dbName; - String tableName = config.tableName; - executor.executeSQL(dbName, "SET 
@@insert_memory_usage_limit=" + config.insertMemoryUsageLimit); + SqlClusterExecutor executor = new SqlClusterExecutor(config.getSdkOption()); + String dbName = config.getDB(); + String tableName = config.getTable(); + executor.executeSQL(dbName, "SET @@insert_memory_usage_limit=" + config.getInsertMemoryUsageLimit()); Schema schema = executor.getTableSchema(dbName, tableName); // create insert placeholder - StringBuilder insert = new StringBuilder("insert into " + tableName + " values(?"); + String insert_part = config.putIfAbsent()? "insert or ignore into " : "insert into "; + StringBuilder insert = new StringBuilder(insert_part + tableName + " values(?"); for (int i = 1; i < schema.getColumnList().size(); i++) { insert.append(",?"); } @@ -60,6 +60,7 @@ public OpenmldbDataWriter(OpenmldbWriteConfig config, int partitionId, long task preparedStatement = executor.getInsertPreparedStmt(dbName, insert.toString()); } catch (SQLException | SqlException e) { e.printStackTrace(); + throw new RuntimeException("create openmldb data writer failed", e); } this.partitionId = partitionId; @@ -75,7 +76,7 @@ public void write(InternalRow record) throws IOException { addRow(record, preparedStatement); preparedStatement.addBatch(); } catch (Exception e) { - throw new IOException("convert to openmldb row failed on " + record, e); + throw new IOException("convert to openmldb row failed on " + readable(record, preparedStatement), e); } } @@ -126,6 +127,19 @@ static void addRow(InternalRow record, PreparedStatement preparedStatement) thro } } + static String readable(InternalRow record, PreparedStatement preparedStatement) { + try { + ResultSetMetaData metaData = preparedStatement.getMetaData(); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < record.numFields(); i++) { + sb.append(record.get(i, OpenmldbTable.sdkTypeToSparkType(metaData.getColumnType(i + 1)))).append(","); + } + return sb.toString(); + } catch (SQLException e) { + return "readable error: " + e.getMessage(); + } + } + @Override public WriterCommitMessage commit() throws IOException { try { @@ -147,12 +161,7 @@ public WriterCommitMessage commit() throws IOException { @Override public void abort() throws IOException { - try { - preparedStatement.close(); - } catch (SQLException e) { - e.printStackTrace(); - throw new IOException("abort error", e); - } + // no transaction, no abort } @Override diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriterFactory.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriterFactory.java index 96e78979b2f..12cefb3928b 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriterFactory.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriterFactory.java @@ -17,20 +17,21 @@ package com._4paradigm.openmldb.spark.write; +import com._4paradigm.openmldb.spark.OpenmldbConfig; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.connector.write.DataWriter; import org.apache.spark.sql.connector.write.DataWriterFactory; public class OpenmldbDataWriterFactory implements DataWriterFactory { - private final OpenmldbWriteConfig config; + private final OpenmldbConfig config; - public OpenmldbDataWriterFactory(OpenmldbWriteConfig config) { + public OpenmldbDataWriterFactory(OpenmldbConfig config) { this.config = config; } @Override public DataWriter createWriter(int partitionId, 
long taskId) { - if (!config.writerType.equals("batch")) { + if (!config.isBatchWriter()) { return new OpenmldbDataSingleWriter(config, partitionId, taskId); } return new OpenmldbDataWriter(config, partitionId, taskId); diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbWriteBuilder.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbWriteBuilder.java index a3c905b15c1..ccd588df0c4 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbWriteBuilder.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbWriteBuilder.java @@ -17,15 +17,16 @@ package com._4paradigm.openmldb.spark.write; +import com._4paradigm.openmldb.spark.OpenmldbConfig; import org.apache.spark.sql.connector.write.BatchWrite; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.WriteBuilder; public class OpenmldbWriteBuilder implements WriteBuilder { - private final OpenmldbWriteConfig config; + private final OpenmldbConfig config; private final LogicalWriteInfo info; - public OpenmldbWriteBuilder(OpenmldbWriteConfig config, LogicalWriteInfo info) { + public OpenmldbWriteBuilder(OpenmldbConfig config, LogicalWriteInfo info) { this.config = config; this.info = info; } diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbWriteConfig.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbWriteConfig.java deleted file mode 100644 index 80007b14ae5..00000000000 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbWriteConfig.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com._4paradigm.openmldb.spark.write; - -import com._4paradigm.openmldb.sdk.SdkOption; - -import java.io.Serializable; - -// Must serializable -public class OpenmldbWriteConfig implements Serializable { - public final String dbName, tableName, zkCluster, zkPath, writerType; - public final int insertMemoryUsageLimit; - - public OpenmldbWriteConfig(String dbName, String tableName, SdkOption option, String writerType, int insertMemoryUsageLimit) { - this.dbName = dbName; - this.tableName = tableName; - this.zkCluster = option.getZkCluster(); - this.zkPath = option.getZkPath(); - this.writerType = writerType; - this.insertMemoryUsageLimit = insertMemoryUsageLimit; - // TODO(hw): other configs in SdkOption - } -} diff --git a/java/openmldb-spark-connector/src/main/scala/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReader.scala b/java/openmldb-spark-connector/src/main/scala/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReader.scala index d8eeb89e7ab..86921d0f4d5 100644 --- a/java/openmldb-spark-connector/src/main/scala/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReader.scala +++ b/java/openmldb-spark-connector/src/main/scala/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReader.scala @@ -1,5 +1,6 @@ package com._4paradigm.openmldb.spark.read +import com._4paradigm.openmldb.spark.OpenmldbConfig import com._4paradigm.openmldb.sdk.{Schema, SdkOption} import com._4paradigm.openmldb.sdk.impl.SqlClusterExecutor import org.apache.spark.sql.catalyst.InternalRow @@ -8,15 +9,10 @@ import org.apache.spark.unsafe.types.UTF8String import java.sql.Types -class OpenmldbPartitionReader(config: OpenmldbReadConfig) extends PartitionReader[InternalRow] { - - val option = new SdkOption - option.setZkCluster(config.zkCluster) - option.setZkPath(config.zkPath) - option.setLight(true) - val executor = new SqlClusterExecutor(option) - val dbName: String = config.dbName - val tableName: String = config.tableName +class OpenmldbPartitionReader(config: OpenmldbConfig) extends PartitionReader[InternalRow] { + val executor = new SqlClusterExecutor(config.getSdkOption) + val dbName: String = config.getDB + val tableName: String = config.getTable val schema: Schema = executor.getTableSchema(dbName, tableName) executor.executeSQL(dbName, "SET @@execute_mode='online'") diff --git a/java/openmldb-synctool/src/main/java/com/_4paradigm/openmldb/synctool/SyncToolConfig.java b/java/openmldb-synctool/src/main/java/com/_4paradigm/openmldb/synctool/SyncToolConfig.java index 4fdb22834db..5fa14e2dc0e 100644 --- a/java/openmldb-synctool/src/main/java/com/_4paradigm/openmldb/synctool/SyncToolConfig.java +++ b/java/openmldb-synctool/src/main/java/com/_4paradigm/openmldb/synctool/SyncToolConfig.java @@ -37,6 +37,9 @@ public class SyncToolConfig { // public static int CHANNEL_KEEP_ALIVE_TIME; public static String ZK_CLUSTER; public static String ZK_ROOT_PATH; + + public static String USER; + public static String PASSWORD; public static String ZK_CERT; public static String SYNC_TASK_PROGRESS_PATH; @@ -87,6 +90,8 @@ private static void parseFromProperties(Properties prop) { if (ZK_ROOT_PATH.isEmpty()) { throw new RuntimeException("zookeeper.root_path should not be empty"); } + USER = prop.getProperty("user", "root"); + PASSWORD = prop.getProperty("password", ""); ZK_CERT = prop.getProperty("zookeeper.cert", ""); HADOOP_CONF_DIR = prop.getProperty("hadoop.conf.dir", ""); diff --git a/java/openmldb-synctool/src/main/java/com/_4paradigm/openmldb/synctool/SyncToolImpl.java 
b/java/openmldb-synctool/src/main/java/com/_4paradigm/openmldb/synctool/SyncToolImpl.java index 0e98cffa6f3..0685ab310d4 100644 --- a/java/openmldb-synctool/src/main/java/com/_4paradigm/openmldb/synctool/SyncToolImpl.java +++ b/java/openmldb-synctool/src/main/java/com/_4paradigm/openmldb/synctool/SyncToolImpl.java @@ -92,6 +92,8 @@ public SyncToolImpl(String endpoint) throws SqlException, InterruptedException { option.setZkCluster(SyncToolConfig.ZK_CLUSTER); option.setZkPath(SyncToolConfig.ZK_ROOT_PATH); option.setZkCert(SyncToolConfig.ZK_CERT); + option.setUser(SyncToolConfig.USER); + option.setPassword(SyncToolConfig.PASSWORD); this.router = new SqlClusterExecutor(option); this.zkCollectorPath = SyncToolConfig.ZK_ROOT_PATH + "/sync_tool/collector"; diff --git a/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/config/TaskManagerConfig.java b/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/config/TaskManagerConfig.java index bba740a2ffa..d849137fb3a 100644 --- a/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/config/TaskManagerConfig.java +++ b/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/config/TaskManagerConfig.java @@ -121,6 +121,10 @@ public static int getZkMaxConnectWaitTime() { return getInt("zookeeper.max_connect_waitTime"); } + public static String getUser() { return getString("user"); } + + public static String getPassword() { return getString("password"); } + public static String getSparkMaster() { return getString("spark.master"); } @@ -283,6 +287,14 @@ private void init() throws ConfigException { props.setProperty("zookeeper.session_timeout", "5000"); } + if (props.getProperty("user") == null) { + props.setProperty("user", "root"); + } + + if (props.getProperty("password") == null) { + props.setProperty("password", ""); + } + if (getZkSessionTimeout() <= 0) { throw new ConfigException("zookeeper.session_timeout", "should be larger than 0"); } diff --git a/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/JobInfoManager.scala b/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/JobInfoManager.scala index cd5c65e2cc4..73394749313 100644 --- a/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/JobInfoManager.scala +++ b/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/JobInfoManager.scala @@ -45,6 +45,10 @@ object JobInfoManager { private val option = new SdkOption option.setZkCluster(TaskManagerConfig.getZkCluster) option.setZkPath(TaskManagerConfig.getZkRootPath) + option.setUser(TaskManagerConfig.getUser) + if (!TaskManagerConfig.getPassword.isEmpty) { + option.setPassword(TaskManagerConfig.getPassword) + } val sqlExecutor = new SqlClusterExecutor(option) sqlExecutor.executeSQL("", "set @@execute_mode='online';") diff --git a/java/pom.xml b/java/pom.xml index 0e67ba05a15..f5f46b726b6 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -294,7 +294,7 @@ ${project.build.directory}/surefire-reports . 
WDF TestSuite.txt - -Xmx8192m -XX:MaxPermSize=2048m -Duser.timezone=GMT+8 + -Xmx8192m -XX:MaxMetaspaceSize=2048m -Duser.timezone=GMT+8 ${scalatest.skip} diff --git a/python/openmldb_sdk/openmldb/sdk/sdk.py b/python/openmldb_sdk/openmldb/sdk/sdk.py index e079f77c5d3..68020e08c80 100644 --- a/python/openmldb_sdk/openmldb/sdk/sdk.py +++ b/python/openmldb_sdk/openmldb/sdk/sdk.py @@ -71,6 +71,10 @@ def init(self): if 'maxSqlCacheSize' in self.options_map: options.max_sql_cache_size = int( self.options_map['maxSqlCacheSize']) + if 'user' in self.options_map: + options.user = self.options_map['user'] + if 'password' in self.options_map: + options.password = self.options_map['password'] self.sdk = sql_router_sdk.NewClusterSQLRouter( options diff --git a/python/openmldb_sdk/tests/sqlalchemy_api_test.py b/python/openmldb_sdk/tests/sqlalchemy_api_test.py index 545df92be59..b5c4dbd7b50 100644 --- a/python/openmldb_sdk/tests/sqlalchemy_api_test.py +++ b/python/openmldb_sdk/tests/sqlalchemy_api_test.py @@ -57,6 +57,7 @@ def test_select(self): assert 'first' in list(row) assert 100 in list(row) + @pytest.mark.skip(reason="test may fail to init") def test_request_timeout(self): self.connection.execute( "insert into test_table (y, x) values(400, 'a'),(401,'b'),(402, 'c');" diff --git a/python/openmldb_tool/diagnostic_tool/collector.py b/python/openmldb_tool/diagnostic_tool/collector.py index 7e143025e11..41403061610 100644 --- a/python/openmldb_tool/diagnostic_tool/collector.py +++ b/python/openmldb_tool/diagnostic_tool/collector.py @@ -115,7 +115,8 @@ def get_spark_home(server_info: ServerInfo): tm_conf_path = server_info.conf_path_pair("")[0] config_name = "spark.home=" log.debug("get %s from %s", config_name, tm_conf_path) - grep_str, _ = server_info.cmd_on_host(f"grep {config_name} {tm_conf_path}") + # last one option + grep_str, _ = server_info.cmd_on_host(f"grep {config_name} {tm_conf_path} | tail -n 1") if not grep_str: # TODO(hw):no config in file, get env SPARK_HOME? 
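The Spark connector hunks above consolidate all reader/writer settings into the serializable OpenmldbConfig and thread the new user/password, putIfAbsent and insert_memory_usage_limit options through OpenmldbSource.inferSchema into SdkOption, mirroring the user/password plumbing added to the sync tool, task manager and Python SDK. Below is a minimal sketch of how a Spark job would pass these options; it assumes the connector jar is on the Spark classpath, uses the fully qualified source class name from this patch (a DataSourceRegister short name is not visible in these hunks), and all paths, database names and credentials are placeholders.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class OpenmldbConnectorWriteDemo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("openmldb-connector-demo")
                .getOrCreate();

        // Hypothetical input; any DataFrame whose schema matches the target table works.
        Dataset<Row> df = spark.read().parquet("/tmp/demo_input.parquet");

        df.write()
                // Fully qualified source class from this patch; a short name may also be
                // registered via DataSourceRegister, but it is not shown in these hunks.
                .format("com._4paradigm.openmldb.spark.OpenmldbSource")
                .option("db", "demo_db")                    // OpenmldbConfig.DB
                .option("table", "demo_table")              // OpenmldbConfig.TABLE
                .option("zkCluster", "127.0.0.1:2181")      // OpenmldbConfig.ZK_CLUSTER
                .option("zkPath", "/openmldb")              // OpenmldbConfig.ZK_PATH
                .option("user", "root")                     // new: forwarded to SdkOption.setUser
                .option("password", "")                     // new: forwarded to SdkOption.setPassword
                .option("writerType", "batch")              // "single" (default) or "batch"
                .option("putIfAbsent", "true")              // new: writers build INSERT OR IGNORE
                .option("insert_memory_usage_limit", "0")   // SET @@insert_memory_usage_limit on the writer
                .mode(SaveMode.Append)
                .save();

        spark.stop();
    }
}

With putIfAbsent=true both the single and the batch writer build an INSERT OR IGNORE statement instead of a plain INSERT, and writerType=batch defers execution until a whole partition has been read, as described in the OpenmldbConfig comments.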
diff --git a/python/openmldb_tool/diagnostic_tool/connector.py b/python/openmldb_tool/diagnostic_tool/connector.py index 87b28a8932a..5c69ad9a1d4 100644 --- a/python/openmldb_tool/diagnostic_tool/connector.py +++ b/python/openmldb_tool/diagnostic_tool/connector.py @@ -23,6 +23,8 @@ 'cluster', '127.0.0.1:2181/openmldb', 'Cluster addr, format: [,]/.', short_name='c') flags.DEFINE_bool('sdk_log', False, 'print sdk log(pysdk&zk&glog), default is False.') +flags.DEFINE_string('user', 'root', 'the username to connect OpenMLDB') +flags.DEFINE_string('password', '', 'config the password') FLAGS = flags.FLAGS @@ -38,6 +40,9 @@ def __init__(self): if not FLAGS.sdk_log: url += '&zkLogLevel=0&glogLevel=2' logging.getLogger('OpenMLDB_sdk').setLevel(logging.WARNING) + url += '&user=' + FLAGS.user + if FLAGS.password != '': + url += '&password=' + FLAGS.password self.engine = db.create_engine(url) self.conn = self.engine.connect() diff --git a/release/conf/openmldb-env.sh b/release/conf/openmldb-env.sh index 5ba917c49e7..3b4b83dd5ef 100644 --- a/release/conf/openmldb-env.sh +++ b/release/conf/openmldb-env.sh @@ -2,15 +2,12 @@ export OPENMLDB_VERSION=0.8.4 # openmldb mode: standalone / cluster export OPENMLDB_MODE=${OPENMLDB_MODE:=cluster} -# tablet port -export OPENMLDB_TABLET_PORT=10921 -# nameserver port -export OPENMLDB_NAMESERVER_PORT=7527 -# taskmanager port -export OPENMLDB_TASKMANAGER_PORT=9902 -# apiserver port -export OPENMLDB_APISERVER_PORT=9080 - +# openmldb root path +export OPENMLDB_HOME= +# the root path of openmldb spark release, default is $OPENMLDB_HOME/spark +# if not exists, download from online +export SPARK_HOME= +export RUNNER_EXISTING_SPARK_HOME= # if OPENMLDB_USE_EXISTING_ZK_CLUSTER is set, will use existing zk cluster export OPENMLDB_USE_EXISTING_ZK_CLUSTER=false # the root path of zookeeper release, default is $OPENMLDB_HOME/zookeeper @@ -20,17 +17,26 @@ export OPENMLDB_ZK_HOME= export OPENMLDB_ZK_CLUSTER= # zookeeper root path export OPENMLDB_ZK_ROOT_PATH=/openmldb + +export OPENMLDB_FORCE_LOCAL=false + +export RUNNER_JAVA_HOME= + +# if CLEAR_OPENMLDB_INSTALL_DIR is set, all files in the WORKDIR will be deleted when running sbin/clear-all.sh +export CLEAR_OPENMLDB_INSTALL_DIR=false + +# tablet port +export OPENMLDB_TABLET_PORT=10921 +# nameserver port +export OPENMLDB_NAMESERVER_PORT=7527 +# taskmanager port +export OPENMLDB_TASKMANAGER_PORT=9902 +# apiserver port +export OPENMLDB_APISERVER_PORT=9080 + # zookeeper client port, clientPort=2181 in zoo.cfg export OPENMLDB_ZK_CLUSTER_CLIENT_PORT=2181 # zookeeper peer port, which is the first port in this config server.1=zoo1:2888:3888 in zoo.cfg export OPENMLDB_ZK_CLUSTER_PEER_PORT=2888 # zookeeper election port, which is the second port in this config server.1=zoo1:2888:3888 in zoo.cfg export OPENMLDB_ZK_CLUSTER_ELECTION_PORT=3888 - -# openmldb root path -export OPENMLDB_HOME= -# the root path of openmldb spark release, default is $OPENMLDB_HOME/spark -# if not exists, download from online -export SPARK_HOME= -# if CLEAR_OPENMLDB_INSTALL_DIR is set, all files in the WORKDIR will be deleted when running sbin/clear-all.sh -export CLEAR_OPENMLDB_INSTALL_DIR=false diff --git a/release/sbin/deploy-all.sh b/release/sbin/deploy-all.sh index 3a4f101b15b..ddfc7e712cd 100755 --- a/release/sbin/deploy-all.sh +++ b/release/sbin/deploy-all.sh @@ -33,7 +33,7 @@ distribute() { type=$4 fi local use_ssh=true - if [[ $host = "localhost" || $host = "127.0.0.1" ]]; then + if [[ "$OPENMLDB_FORCE_LOCAL" = true || "$host" = "localhost" || "$host" = 
"127.0.0.1" ]]; then use_ssh=false if [[ "$dest" = "$src" ]]; then echo "skip rsync as dest=src: $dest" @@ -56,7 +56,10 @@ distribute() { else if [[ "$type" = "taskmanager" ]]; then dir_list=(bin sbin conf taskmanager) - if [[ "$use_ssh" = true ]]; then + if [[ -n "$RUNNER_EXISTING_SPARK_HOME" ]]; then + echo "use existing spark $RUNNER_EXISTING_SPARK_HOME on $host, skip deploy spark" + elif [[ "$use_ssh" = true ]]; then + run_auto "$host" "mkdir -p ${SPARK_HOME} > /dev/null 2>&1" rsync -arz "${SPARK_HOME}/" "$host:${SPARK_HOME}/" fi else @@ -146,6 +149,10 @@ function download_spark { # deploy taskmanagers downloaded=false +if [[ -n "${RUNNER_EXISTING_SPARK_HOME}" ]]; then + echo "use $RUNNER_EXISTING_SPARK_HOME, skip download openmldbspark" + downloaded=true +fi for line in $(parse_host conf/hosts taskmanager) do if ! $downloaded; then @@ -158,7 +165,7 @@ do echo "deploy taskmanager to $host:$port $dir" distribute "$host" "$dir" "$home" taskmanager - cmd="cd $dir && OPENMLDB_VERSION=${OPENMLDB_VERSION} SPARK_HOME=${SPARK_HOME} OPENMLDB_HOST=$host OPENMLDB_TASKMANAGER_PORT=$port OPENMLDB_ZK_CLUSTER=${OPENMLDB_ZK_CLUSTER} OPENMLDB_ZK_ROOT_PATH=${OPENMLDB_ZK_ROOT_PATH} sbin/deploy.sh taskmanager" + cmd="cd $dir && SPARK_HOME=${SPARK_HOME} OPENMLDB_HOST=$host OPENMLDB_TASKMANAGER_PORT=$port OPENMLDB_ZK_CLUSTER=${OPENMLDB_ZK_CLUSTER} OPENMLDB_ZK_ROOT_PATH=${OPENMLDB_ZK_ROOT_PATH} sbin/deploy.sh taskmanager" run_auto "$host" "$cmd" done diff --git a/release/sbin/init.sh b/release/sbin/init.sh index b73ab226b81..1b20442bb48 100755 --- a/release/sbin/init.sh +++ b/release/sbin/init.sh @@ -90,7 +90,7 @@ function parse_host { run_auto() { local host=$1 local cmd=$2 - if [[ $host = "localhost" || $host = "127.0.0.1" ]]; then + if [[ "$OPENMLDB_FORCE_LOCAL" = true || "$host" = "localhost" || "$host" = "127.0.0.1" ]]; then local cur_dir cur_dir=$(pwd) bash -c "$cmd" @@ -105,7 +105,11 @@ if [ -z "${OPENMLDB_HOME}" ]; then export OPENMLDB_HOME fi -if [ -z "${SPARK_HOME}" ]; then +if [ -n "$RUNNER_EXISTING_SPARK_HOME" ]; then + echo "use existing spark $RUNNER_EXISTING_SPARK_HOME on runner, overwrite SPARK_HOME" + SPARK_HOME="$RUNNER_EXISTING_SPARK_HOME" + export SPARK_HOME +elif [ -z "${SPARK_HOME}" ]; then SPARK_HOME=${OPENMLDB_HOME}/spark export SPARK_HOME fi diff --git a/release/sbin/openmldb-cli.sh b/release/sbin/openmldb-cli.sh index 2102990164a..19bd5160a7b 100755 --- a/release/sbin/openmldb-cli.sh +++ b/release/sbin/openmldb-cli.sh @@ -20,7 +20,7 @@ sbin="$(cd "$(dirname "$0")" || exit 1; pwd)" . "$home"/conf/openmldb-env.sh . 
"$sbin"/init.sh cd "$home" || exit 1 - +echo "${OPENMLDB_MODE} ${OPENMLDB_ZK_CLUSTER} ${OPENMLDB_ZK_ROOT_PATH}" if [[ -n "$OPENMLDB_MODE" && "$OPENMLDB_MODE" = "cluster" ]]; then bin/openmldb --zk_cluster="${OPENMLDB_ZK_CLUSTER}" --zk_root_path="${OPENMLDB_ZK_ROOT_PATH}" --role=sql_client "$@" else diff --git a/release/sbin/start-taskmanagers.sh b/release/sbin/start-taskmanagers.sh index b6873c33089..322824dfbbf 100755 --- a/release/sbin/start-taskmanagers.sh +++ b/release/sbin/start-taskmanagers.sh @@ -38,11 +38,13 @@ else echo "start taskmanager in $dir with endpoint $host:$port " cmd="cd $dir && SPARK_HOME=${SPARK_HOME} bin/start.sh start taskmanager $*" - run_auto "$host" "$cmd" - - # Print the log of taskmanager if fail - #cmd="cd $dir && cat taskmanager/bin/logs/taskmanager.log" - #run_auto "$host" "$cmd" + # special for java + pre="" + if [[ -n $RUNNER_JAVA_HOME ]]; then + echo "overwrite java env by RUNNER_JAVA_HOME:$RUNNER_JAVA_HOME" + pre="export JAVA_HOME=$RUNNER_JAVA_HOME && export PATH=$JAVA_HOME/bin:$PATH &&" + fi + run_auto "$host" "$pre $cmd" done IFS="$old_IFS" fi diff --git a/release/sbin/start-zks.sh b/release/sbin/start-zks.sh index c13b762be90..775d52715ac 100755 --- a/release/sbin/start-zks.sh +++ b/release/sbin/start-zks.sh @@ -33,6 +33,12 @@ do echo "start zookeeper in $dir with endpoint $host:$port " cmd="cd $dir && bin/zkServer.sh start" - run_auto "$host" "$cmd" + # special for java + pre="" + if [[ -n $RUNNER_JAVA_HOME ]]; then + echo "overwrite java env by RUNNER_JAVA_HOME:$RUNNER_JAVA_HOME" + pre="export JAVA_HOME=$RUNNER_JAVA_HOME && export PATH=$JAVA_HOME/bin:$PATH &&" + fi + run_auto "$host" "$pre $cmd" done -IFS="$old_IFS" \ No newline at end of file +IFS="$old_IFS" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d24b41fef9f..a5c5e642122 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -60,7 +60,7 @@ endfunction(compile_lib) set(TEST_LIBS openmldb_test_base apiserver nameserver tablet query_response_time openmldb_sdk - openmldb_catalog client zk_client storage schema replica base openmldb_codec openmldb_proto log + openmldb_catalog client zk_client storage schema replica openmldb_codec base openmldb_proto log common zookeeper_mt tcmalloc_minimal ${RocksDB_LIB} ${VM_LIBS} ${LLVM_LIBS} ${ZETASQL_LIBS} ${BRPC_LIBS}) if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.1") # GNU implementation prior to 9.1 requires linking with -lstdc++fs diff --git a/src/apiserver/api_server_impl.cc b/src/apiserver/api_server_impl.cc index cb13414798f..34b0b17fc97 100644 --- a/src/apiserver/api_server_impl.cc +++ b/src/apiserver/api_server_impl.cc @@ -30,13 +30,38 @@ namespace openmldb { namespace apiserver { +std::string PrintJsonValue(const Value& v) { + if (v.IsNull()) { + return "null"; + } + if (v.IsBool()) { + return v.GetBool() ? 
"true" : "false"; + } + if (v.IsInt()) { + return std::to_string(v.GetInt()); + } + if (v.IsInt64()) { + return std::to_string(v.GetInt64()); + } + if (v.IsFloat()) { + return std::to_string(v.GetFloat()); + } + if (v.IsDouble()) { + return std::to_string(v.GetDouble()); + } + if (v.IsString()) { + return v.GetString(); + } + return "unknown"; +} + APIServerImpl::APIServerImpl(const std::string& endpoint) : md_recorder_("rpc_server_" + endpoint.substr(endpoint.find(":") + 1), "http_method", {"method"}), provider_("rpc_server_" + endpoint.substr(endpoint.find(":") + 1)) {} APIServerImpl::~APIServerImpl() = default; -bool APIServerImpl::Init(const sdk::ClusterOptions& options) { +bool APIServerImpl::Init(const std::shared_ptr<::openmldb::sdk::SQLRouterOptions>& options) { // If cluster sdk is needed, use ptr, don't own it. SQLClusterRouter owns it. auto cluster_sdk = new ::openmldb::sdk::ClusterSDK(options); bool ok = cluster_sdk->Init(); @@ -129,11 +154,10 @@ void APIServerImpl::RegisterQuery() { } auto db = db_it->second; - QueryReq req; JsonReader query_reader(req_body.to_string().c_str()); - query_reader >> req; - if (!query_reader) { - writer << resp.Set("Json parse failed, " + req_body.to_string()); + QueryReq req(query_reader); + if (!req.status().ok()) { + writer << resp.Set("Json parse failed, " + req.status().ToString()); return; } auto mode = boost::to_lower_copy(req.mode); @@ -199,15 +223,17 @@ absl::Status APIServerImpl::JsonArray2SQLRequestRow(const Value& non_common_cols for (decltype(sch->GetColumnCnt()) i = 0; i < sch->GetColumnCnt(); ++i) { if (sch->IsConstant(i)) { if (!AppendJsonValue(common_cols_v[common_idx], sch->GetColumnType(i), sch->IsColumnNotNull(i), row)) { - return absl::InvalidArgumentError( - absl::StrCat("trans const ", sch->GetColumnName(i), "[", sch->GetColumnType(i), "] failed")); + return absl::InvalidArgumentError(absl::StrCat("trans const failed on ", sch->GetColumnName(i), "(", + sch->GetColumnType(i), + "): ", PrintJsonValue(common_cols_v[common_idx]))); } ++common_idx; } else { if (!AppendJsonValue(non_common_cols_v[non_common_idx], sch->GetColumnType(i), sch->IsColumnNotNull(i), row)) { return absl::InvalidArgumentError( - absl::StrCat("trans ", sch->GetColumnName(i), "[", sch->GetColumnType(i), "] failed")); + absl::StrCat("trans failed on ", sch->GetColumnName(i), "(", sch->GetColumnType(i), + "): ", PrintJsonValue(non_common_cols_v[non_common_idx]))); } ++non_common_idx; } @@ -326,8 +352,9 @@ absl::Status APIServerImpl::JsonMap2SQLRequestRow(const Value& non_common_cols_v for (decltype(sch->GetColumnCnt()) i = 0; i < sch->GetColumnCnt(); ++i) { if (sch->IsConstant(i)) { if (!AppendJsonValue(common_cols_v[common_idx], sch->GetColumnType(i), sch->IsColumnNotNull(i), row)) { - return absl::InvalidArgumentError( - absl::StrCat("trans const ", sch->GetColumnName(i), "[", sch->GetColumnType(i), "] failed")); + return absl::InvalidArgumentError(absl::StrCat("trans const failed on ", sch->GetColumnName(i), "(", + sch->GetColumnType(i), + "): ", PrintJsonValue(common_cols_v[common_idx]))); } ++common_idx; } else { @@ -336,8 +363,8 @@ absl::Status APIServerImpl::JsonMap2SQLRequestRow(const Value& non_common_cols_v return absl::InvalidArgumentError("can't find " + sch->GetColumnName(i)); } if (!AppendJsonValue(v->value, sch->GetColumnType(i), sch->IsColumnNotNull(i), row)) { - return absl::InvalidArgumentError( - absl::StrCat("trans ", sch->GetColumnName(i), "[", sch->GetColumnType(i), "] failed")); + return absl::InvalidArgumentError(absl::StrCat("trans 
failed on ", sch->GetColumnName(i), "(", + sch->GetColumnType(i), "): ", PrintJsonValue(v->value))); } } } @@ -381,7 +408,7 @@ void APIServerImpl::RegisterPut() { std::string insert_placeholder = "insert into " + table + " values(" + holders + ");"; auto row = sql_router_->GetInsertRow(db, insert_placeholder, &status); if (!row) { - writer << resp.Set(status.msg); + writer << resp.Set(status.code, status.msg); return; } auto schema = row->GetSchema(); @@ -408,7 +435,8 @@ void APIServerImpl::RegisterPut() { for (int i = 0; i < cnt; ++i) { if (!AppendJsonValue(arr[i], schema->GetColumnType(i), schema->IsColumnNotNull(i), row)) { - writer << resp.Set("convertion failed for col " + schema->GetColumnName(i)); + writer << resp.Set(absl::StrCat("convertion failed on col ", schema->GetColumnName(i), "[", + schema->GetColumnType(i), "] with value ", arr[i].GetString())); return; } } @@ -736,25 +764,30 @@ std::string APIServerImpl::InnerTypeTransform(const std::string& s) { return out; } -JsonReader& operator&(JsonReader& ar, QueryReq& s) { // NOLINT - ar.StartObject(); +JsonReader& QueryReq::parse(JsonReader& ar) { // NOLINT + RETURN_AR_IF_ERROR(ar.StartObject(), "req is not object"); // mode is not optional - ar.Member("mode") & s.mode; - ar.Member("sql") & s.sql; + RETURN_AR_IF_ERROR(ar.Member("mode") & mode, "mode parse failed"); + + RETURN_AR_IF_ERROR(ar.Member("sql") & sql, "sql parse failed"); + if (ar.HasMember("timeout")) { - ar.Member("timeout") & s.timeout; + RETURN_AR_IF_ERROR(ar.Member("timeout") & timeout, "timeout parse failed"); } + if (ar.HasMember("input")) { - ar.Member("input") & s.parameter; + RETURN_AR_IF_ERROR(parse(ar.Member("input"), parameter), "input parse failed"); } if (ar.HasMember("write_nan_and_inf_null")) { - ar.Member("write_nan_and_inf_null") & s.write_nan_and_inf_null; + RETURN_AR_IF_ERROR(ar.Member("write_nan_and_inf_null") & write_nan_and_inf_null, + "write_nan_and_inf_null parse failed"); } - return ar.EndObject(); + RETURN_AR_IF_ERROR(ar.EndObject(), "req object end error"); + return ar; } -JsonReader& operator&(JsonReader& ar, std::shared_ptr& parameter) { // NOLINT - ar.StartObject(); +JsonReader& QueryReq::parse(JsonReader& ar, std::shared_ptr& parameter) { // NOLINT + RETURN_AR_IF_ERROR(ar.StartObject(), "input is not object"); if (!ar.HasMember("schema") || !ar.HasMember("data")) return ar.EndObject(); @@ -789,10 +822,12 @@ JsonReader& operator&(JsonReader& ar, std::shared_ptrset_type(::hybridse::type::kTimestamp); } else { + status_.Update(absl::InvalidArgumentError("invalid type " + type)); return ar; } } - ar.EndArray(); // end "schema" + // end "schema" + RETURN_AR_IF_ERROR(ar.EndArray(), "schema parse failed"); } int32_t str_length = 0; @@ -800,32 +835,33 @@ JsonReader& operator&(JsonReader& ar, std::shared_ptr(size) != schema.size()) return ar; + RETURN_AR_IF_NOT_OK(static_cast(size) == schema.size(), ar, + absl::StrCat("data size ", size, " != schema size ", schema.size())); for (auto col = schema.begin(); col != schema.end(); col++) { if (col->type() == ::hybridse::type::kVarchar) { std::string str; - ar& str; + ar & str; str_length += str.length(); } else { ar.Next(); } } - ar.EndArray(); // end first iter "data" + // end first iter "data" + RETURN_AR_IF_ERROR(ar.EndArray(), "data array end error"); } { - ::hybridse::sdk::SchemaImpl* schema_impl = new ::hybridse::sdk::SchemaImpl(schema); - parameter.reset(new openmldb::sdk::SQLRequestRow(std::shared_ptr<::hybridse::sdk::Schema>(schema_impl), - std::set({}))); - + parameter.reset(new 
openmldb::sdk::SQLRequestRow( + std::shared_ptr<::hybridse::sdk::Schema>(new ::hybridse::sdk::SchemaImpl(schema)), {})); ar.Member("data"); size_t size; ar.StartArray(&size); // start second iter "data" - if (!parameter->Init(str_length)) return ar; + RETURN_AR_IF_NOT_OK(parameter->Init(str_length), ar, "init parameter row failed"); - for (auto col = schema.begin(); col != schema.end(); col++) { + for (size_t i = 0; i < size; i++) { + auto& col = schema.Get(i); bool ok; - switch (col->type()) { + switch (col.type()) { case ::hybridse::type::kBool: { bool b; ar& b; @@ -874,15 +910,18 @@ JsonReader& operator&(JsonReader& ar, std::shared_ptrBuild()) return ar; - - ar.EndArray(); // end second iter "data" + RETURN_AR_IF_NOT_OK(parameter->Build(), ar, "build parameter failed"); + // end second iter "data" + RETURN_AR_IF_ERROR(ar.EndArray(), "data array end error"); } - return ar.EndObject(); + RETURN_AR_IF_ERROR(ar.EndObject(), "input object end error"); + return ar; } void WriteSchema(JsonWriter& ar, const std::string& name, const hybridse::sdk::Schema& schema, // NOLINT diff --git a/src/apiserver/api_server_impl.h b/src/apiserver/api_server_impl.h index fc8e8022417..353ff371996 100644 --- a/src/apiserver/api_server_impl.h +++ b/src/apiserver/api_server_impl.h @@ -49,7 +49,7 @@ class APIServerImpl : public APIServer { public: explicit APIServerImpl(const std::string& endpoint); ~APIServerImpl() override; - bool Init(const sdk::ClusterOptions& options); + bool Init(const std::shared_ptr<::openmldb::sdk::SQLRouterOptions>& options); bool Init(::openmldb::sdk::DBSDK* cluster); void Process(google::protobuf::RpcController* cntl_base, const HttpRequest*, HttpResponse*, google::protobuf::Closure* done) override; @@ -71,15 +71,12 @@ class APIServerImpl : public APIServer { void ExecuteProcedure(bool has_common_col, const InterfaceProvider::Params& param, const butil::IOBuf& req_body, JsonWriter& writer); // NOLINT - static absl::Status JsonArray2SQLRequestRow(const Value& non_common_cols_v, - const Value& common_cols_v, + static absl::Status JsonArray2SQLRequestRow(const Value& non_common_cols_v, const Value& common_cols_v, std::shared_ptr row); - static absl::Status JsonMap2SQLRequestRow(const Value& non_common_cols_v, - const Value& common_cols_v, + static absl::Status JsonMap2SQLRequestRow(const Value& non_common_cols_v, const Value& common_cols_v, std::shared_ptr row); template - static bool AppendJsonValue(const Value& v, hybridse::sdk::DataType type, bool is_not_null, - T row); + static bool AppendJsonValue(const Value& v, hybridse::sdk::DataType type, bool is_not_null, T row); // may get segmentation fault when throw boost::bad_lexical_cast, so we use std::from_chars template @@ -101,17 +98,39 @@ class APIServerImpl : public APIServer { ::openmldb::sdk::DBSDK* cluster_sdk_ = nullptr; }; +#define RETURN_AR_IF_ERROR(expr, msg) \ + do { \ + auto& _ar = (expr); \ + if (!_ar) { \ + status_.Update(absl::InvalidArgumentError(msg)); \ + return _ar; \ + } \ + } while (0) +#define RETURN_AR_IF_NOT_OK(expr, _ar, msg) \ + do { \ + auto _expr = (expr); \ + if (!_expr) { \ + status_.Update(absl::InvalidArgumentError(msg)); \ + return _ar; \ + } \ + } while (0) struct QueryReq { std::string mode; int timeout = -1; // only for offline jobs std::string sql; std::shared_ptr parameter; bool write_nan_and_inf_null = false; -}; -JsonReader& operator&(JsonReader& ar, QueryReq& s); // NOLINT + QueryReq(JsonReader& ar) { parse(ar); } // NOLINT + absl::Status status() { return status_; } -JsonReader& 
operator&(JsonReader& ar, std::shared_ptr& parameter); // NOLINT + private: + JsonReader& parse(JsonReader& ar); // NOLINT + // we want to store errors when parsing, so make this method in class + JsonReader& parse(JsonReader& ar, std::shared_ptr& parameter); // NOLINT + private: + absl::Status status_; +}; struct ExecSPResp { ExecSPResp() = default; @@ -127,7 +146,8 @@ struct ExecSPResp { void WriteSchema(JsonWriter& ar, const std::string& name, const hybridse::sdk::Schema& schema, // NOLINT bool only_const); -void WriteValue(JsonWriter& ar, std::shared_ptr rs, int i, bool write_nan_and_inf_null); // NOLINT +void WriteValue(JsonWriter& ar, std::shared_ptr rs, int i, // NOLINT + bool write_nan_and_inf_null); // ExecSPResp reading is unsupported now, cuz we decode ResultSet with Schema here, it's irreversible JsonWriter& operator&(JsonWriter& ar, ExecSPResp& s); // NOLINT diff --git a/src/apiserver/api_server_test.cc b/src/apiserver/api_server_test.cc index 6abe8ddd051..45435b3b0e0 100644 --- a/src/apiserver/api_server_test.cc +++ b/src/apiserver/api_server_test.cc @@ -28,6 +28,9 @@ #include "rapidjson/rapidjson.h" #include "sdk/mini_cluster.h" +DEFINE_int32(zk_port, 6181, "zk port"); +DEFINE_string(api_server_port, "8084", "api server port"); + namespace openmldb::apiserver { class APIServerTestEnv : public testing::Environment { @@ -41,17 +44,18 @@ class APIServerTestEnv : public testing::Environment { ::hybridse::vm::Engine::InitializeGlobalLLVM(); FLAGS_zk_session_timeout = 100000; - mc = std::make_shared(6181); + mc = std::make_shared(FLAGS_zk_port); ASSERT_TRUE(mc->SetUp()) << "Fail to set up mini cluster"; - sdk::ClusterOptions cluster_options; - cluster_options.zk_cluster = mc->GetZkCluster(); - cluster_options.zk_path = mc->GetZkPath(); - // Owned by queue_svc + auto cluster_options = std::make_shared();; + cluster_options->zk_cluster = mc->GetZkCluster(); + cluster_options->zk_path = mc->GetZkPath(); + // Owned by server_process + cluster_sdk = new ::openmldb::sdk::ClusterSDK(cluster_options); ASSERT_TRUE(cluster_sdk->Init()) << "Fail to connect to db"; - queue_svc = std::make_shared("127.0.0.1:8010"); // fake endpoint for metrics - ASSERT_TRUE(queue_svc->Init(cluster_sdk)); + server_process = std::make_shared("127.0.0.1:8010"); // fake endpoint for metrics + ASSERT_TRUE(server_process->Init(cluster_sdk)); sdk::SQLRouterOptions sql_opt; sql_opt.zk_session_timeout = 30000; @@ -61,11 +65,12 @@ class APIServerTestEnv : public testing::Environment { cluster_remote = sdk::NewClusterSQLRouter(sql_opt); // Http server set up - ASSERT_TRUE(server.AddService(queue_svc.get(), brpc::SERVER_DOESNT_OWN_SERVICE, "/* => Process") == 0) - << "Fail to add queue_svc"; + ASSERT_TRUE(server.AddService(server_process.get(), brpc::SERVER_DOESNT_OWN_SERVICE, "/* => Process") == 0) + << "Fail to add server_process"; // Start the server. 
- int api_server_port = 8010; + api_server_url = "http://127.0.0.1:" + FLAGS_api_server_port; + int api_server_port = std::stoi(FLAGS_api_server_port); brpc::ServerOptions server_options; // options.idle_timeout_sec = FLAGS_idle_timeout_s; ASSERT_TRUE(server.Start(api_server_port, &server_options) == 0) << "Fail to start HttpServer"; @@ -74,12 +79,11 @@ class APIServerTestEnv : public testing::Environment { ASSERT_TRUE(cluster_remote != nullptr); cluster_remote->ExecuteSQL("SET @@execute_mode='online';", &status); - db = "api_server_test"; - cluster_remote->DropDB(db, &status); + db = "api_server_test" + std::to_string(std::rand()); cluster_remote->CreateDB(db, &status); std::vector dbs; - ASSERT_TRUE(cluster_remote->ShowDB(&dbs, &status)); - ASSERT_TRUE(std::find(dbs.begin(), dbs.end(), db) != dbs.end()); + ASSERT_TRUE(cluster_remote->ShowDB(&dbs, &status)) << "Fail to show dbs"; + ASSERT_TRUE(std::find(dbs.begin(), dbs.end(), db) != dbs.end()) << "Fail to create db"; brpc::ChannelOptions options; options.protocol = "http"; @@ -91,6 +95,7 @@ class APIServerTestEnv : public testing::Environment { void TearDown() override { std::cout << "Environment TearDown!" << std::endl; + // just try to clean up hybridse::sdk::Status status; cluster_remote->ExecuteSQL("drop database " + db, &status); server.Stop(0); @@ -98,11 +103,14 @@ class APIServerTestEnv : public testing::Environment { mc->Close(); } - std::string db; ::openmldb::sdk::DBSDK* cluster_sdk = nullptr; std::shared_ptr mc; - std::shared_ptr queue_svc; + std::string api_server_url; + std::shared_ptr server_process; brpc::Server server; + std::string db; + + // http client for api server brpc::Channel http_channel; std::shared_ptr cluster_remote; @@ -236,6 +244,123 @@ TEST_F(APIServerTest, jsonFormat) { ASSERT_STREQ("[NaN,Infinity,-Infinity]", writer.GetString()); } +TEST_F(APIServerTest, reqParse) { + { + JsonReader query_reader(R"({ + "sql": "select c1, c2 from demo;", "mode": "online" + })"); + QueryReq req(query_reader); + ASSERT_TRUE(req.status().ok()) << req.status(); + ASSERT_EQ("online", req.mode); + ASSERT_EQ("select c1, c2 from demo;", req.sql); + } + + { + JsonReader query_reader(R"({ + "sql": "select c1, c2 from demo;", "mode": "offline", "timeout": 1000, "write_nan_and_inf_null": true + })"); + QueryReq req(query_reader); + ASSERT_TRUE(req.status().ok()) << req.status(); + ASSERT_EQ(1000, req.timeout); + ASSERT_TRUE(req.write_nan_and_inf_null); + } + + { + JsonReader query_reader(R"({ + "sql": "select c1, c2 from demo;", "mode": "offline", "timeout": 1000, "write_nan_and_inf_null": true, + "input": { + "schema": ["STRING", "INT"], + "data": ["bb", 1] + } + })"); + QueryReq req(query_reader); + ASSERT_TRUE(req.status().ok()) << req.status(); + ASSERT_EQ(2, req.parameter->GetSchema()->GetColumnCnt()); + ASSERT_EQ(hybridse::sdk::DataType::kTypeString, req.parameter->GetSchema()->GetColumnType(0)); + ASSERT_EQ(hybridse::sdk::DataType::kTypeInt32, req.parameter->GetSchema()->GetColumnType(1)); + } + + // failed cases + { + // different size of schema and data + JsonReader query_reader(R"({ + "sql": "select c1, c2 from demo;", "mode": "offline", + "input": { + "schema": ["STRING", "INT"], + "data": ["bb", 1, 2] + } + })"); + QueryReq req(query_reader); + ASSERT_FALSE(req.status().ok()) << req.status(); + ASSERT_STREQ("data size 3 != schema size 2", req.status().message().data()); + } + { + // invalid data format: string -> int + JsonReader query_reader(R"({ + "sql": "select c1, c2 from demo;", "mode": "offline", + "input": { + 
"schema": ["STRING", "INT"], + "data": ["bb", "1"] + } + })"); + QueryReq req(query_reader); + ASSERT_FALSE(req.status().ok()) << req.status(); + ASSERT_STREQ("append failed on 1 type 3", req.status().message().data()); + } + { + // invalid schema : FOO + JsonReader query_reader(R"({ + "sql": "select c1, c2 from demo;", "mode": "offline", + "input": { + "schema": ["STRING", "FOO"], + "data": ["bb", 1] + } + })"); + QueryReq req(query_reader); + ASSERT_FALSE(req.status().ok()) << req.status(); + ASSERT_STREQ("invalid type FOO", req.status().message().data()); + } + { + // mismatch on sql & parameter, won't fail here, but will fail in execute + JsonReader query_reader(R"({ + "sql": "select c1, c2 from demo;", "mode": "offline", + "input": { + "schema": ["STRING", "INT", "INT", "INT", "INT", "INT", "INT", "INT", "INT", "INT", "INT"], + "data": ["bb", 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + } + })"); + QueryReq req(query_reader); + ASSERT_TRUE(req.status().ok()) << req.status(); + } + { + // invalid json + JsonReader query_reader(R"({ + "sql": "select c1, c2 from demo;", "mode": "offline", + "input": { + "schema": ["STRING", "INT"], + "data": ["bb", "1" + } + })"); + QueryReq req(query_reader); + ASSERT_FALSE(req.status().ok()) << req.status(); + ASSERT_STREQ("req is not object", req.status().message().data()); + } + { + // valid json, but wrong format of field(timeout is int) + JsonReader query_reader(R"({ + "sql": "select c1, c2 from demo;", "mode": "offline", + "input": { + "schema": ["STRING", "INT"], + "data": ["bb", 1] + }, + "timeout": "1000" + })"); + QueryReq req(query_reader); + ASSERT_FALSE(req.status().ok()) << req.status(); + ASSERT_STREQ("timeout parse failed", req.status().message().data()); + } +} + TEST_F(APIServerTest, query) { const auto env = APIServerTestEnv::Instance(); @@ -250,7 +375,7 @@ TEST_F(APIServerTest, query) { { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_POST); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db; cntl.request_attachment().append(R"({ "sql": "select c1, c2 from demo;", "mode": "online" })"); @@ -306,7 +431,7 @@ TEST_F(APIServerTest, parameterizedQuery) { { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_POST); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db; cntl.request_attachment().append(R"({ "sql": "select c1, c2 from demo where c2 = ?;", "mode": "online", @@ -351,7 +476,7 @@ TEST_F(APIServerTest, parameterizedQuery) { { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_POST); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db; cntl.request_attachment().append(R"({ "sql": "select c1, c2 from demo where c2 = ? 
and c1 = ?;", "mode": "online", @@ -423,7 +548,7 @@ TEST_F(APIServerTest, invalidPut) { JsonReader reader(cntl.response_attachment().to_string().c_str()); reader >> resp; } - ASSERT_EQ(-1, resp.code); + ASSERT_EQ(2000, resp.code) << resp.msg; LOG(INFO) << resp.msg; // Invalid table @@ -437,7 +562,7 @@ TEST_F(APIServerTest, invalidPut) { JsonReader reader(cntl.response_attachment().to_string().c_str()); reader >> resp; } - ASSERT_EQ(-1, resp.code); + ASSERT_EQ(2000, resp.code) << resp.msg; LOG(INFO) << resp.msg; // Invalid pattern @@ -477,7 +602,7 @@ TEST_F(APIServerTest, validPut) { { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_PUT); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/tables/" + table; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/tables/" + table; cntl.request_attachment().append( "{\"value\": [[\"foo\", 111, 1.4, \"2021-0 4-27\", 1620471840256, true, \"more str\", null]]}"); env->http_channel.CallMethod(NULL, &cntl, NULL, NULL, NULL); @@ -486,7 +611,7 @@ TEST_F(APIServerTest, validPut) { JsonReader reader(cntl.response_attachment().to_string().c_str()); reader >> resp; ASSERT_EQ(-1, resp.code) << resp.msg; - ASSERT_STREQ("convertion failed for col field4", resp.msg.c_str()); + ASSERT_STREQ("convertion failed on col field4[7] with value 2021-0 4-27", resp.msg.c_str()); } // valid data @@ -496,7 +621,7 @@ TEST_F(APIServerTest, validPut) { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_PUT); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/tables/" + table; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/tables/" + table; cntl.request_attachment().append("{\"value\": [[\"" + key + "\", 111, 1.4, \"2021-04-27\", 1620471840256, true, \"more str\", null]]}"); env->http_channel.CallMethod(NULL, &cntl, NULL, NULL, NULL); @@ -531,7 +656,7 @@ TEST_F(APIServerTest, putCase1) { const auto env = APIServerTestEnv::Instance(); // create table - std::string table = "put"; + std::string table = "putCase1"; std::string ddl = "create table if not exists " + table + "(c1 string, " "c3 int, " @@ -544,7 +669,7 @@ TEST_F(APIServerTest, putCase1) { { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_PUT); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/tables/" + table; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/tables/" + table; cntl.request_attachment().append(R"({ "value": [ ["", 111, 1620471840256] @@ -562,7 +687,7 @@ TEST_F(APIServerTest, putCase1) { { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_PUT); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/tables/" + table; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/tables/" + table; cntl.request_attachment().append(R"({ "value": [ ["drop table test1;", 111, 1620471840256] @@ -580,7 +705,7 @@ TEST_F(APIServerTest, putCase1) { { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_PUT); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/tables/" + table; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/tables/" + table; // Invalid timestamp cntl.request_attachment().append(R"({ "value": [ @@ -599,7 +724,7 @@ TEST_F(APIServerTest, putCase1) { { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_PUT); - cntl.http_request().uri() = 
"http://127.0.0.1:8010/dbs/" + env->db + "/tables/" + table; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/tables/" + table; cntl.request_attachment().append(R"({ "value": [ ["中文", 111, 1620471840256] @@ -618,7 +743,7 @@ TEST_F(APIServerTest, putCase1) { { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_PUT); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/tables/" + table; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/tables/" + table; cntl.request_attachment().append(R"({ "value": [ [null, 111, 1620471840256] @@ -673,7 +798,7 @@ TEST_F(APIServerTest, procedure) { // show procedure brpc::Controller show_cntl; // default is GET - show_cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/procedures/" + sp_name; + show_cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/procedures/" + sp_name; env->http_channel.CallMethod(NULL, &show_cntl, NULL, NULL, NULL); ASSERT_FALSE(show_cntl.Failed()) << show_cntl.ErrorText(); LOG(INFO) << "get sp resp: " << show_cntl.response_attachment(); @@ -694,7 +819,7 @@ TEST_F(APIServerTest, procedure) { { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_POST); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/procedures/" + sp_name; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/procedures/" + sp_name; cntl.request_attachment().append(R"({ "common_cols":["bb", 23, 1590738994000], "input": [[123, 5.1, 6.1, "2021-08-01"],[234, 5.2, 6.2, "2021-08-02"]], @@ -720,7 +845,7 @@ TEST_F(APIServerTest, procedure) { { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_POST); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/procedures/" + sp_name; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/procedures/" + sp_name; cntl.request_attachment().append(R"({ "common_cols":["bb", 23, 1590738994000], "input": [[123, 5.1, 6.1, "2021-08-01"],[234, 5.2, 6.2, "2021-08-02"]], @@ -747,7 +872,7 @@ TEST_F(APIServerTest, procedure) { { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_POST); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/procedures/" + sp_name; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/procedures/" + sp_name; cntl.request_attachment().append(R"({ "common_cols":["bb", 23, 1590738994000], "input": [[123, 5.1, 6.1, "20 21-08-01"],[234, 5.2, 6.2, "2021-08-0 2"]], @@ -792,7 +917,7 @@ TEST_F(APIServerTest, testResultType) { { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_POST); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/deployments/" + "d1"; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/deployments/" + "d1"; cntl.request_attachment().append(R"({ "input": [ @@ -867,7 +992,7 @@ TEST_F(APIServerTest, no_common) { // show procedure brpc::Controller show_cntl; // default is GET - show_cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/procedures/" + sp_name; + show_cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/procedures/" + sp_name; env->http_channel.CallMethod(NULL, &show_cntl, NULL, NULL, NULL); ASSERT_FALSE(show_cntl.Failed()) << show_cntl.ErrorText(); LOG(INFO) << "get sp resp: " << show_cntl.response_attachment(); @@ -887,7 +1012,7 @@ TEST_F(APIServerTest, no_common) { { brpc::Controller 
cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_POST); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/deployments/" + sp_name; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/deployments/" + sp_name; cntl.request_attachment().append(R"({ "input": [["bb", 23, 123, 5.1, 6.1, 1590738994000, "2021-08-01"], ["bb", 23, 234, 5.2, 6.2, 1590738994000, "2021-08-02"]], @@ -954,7 +1079,7 @@ TEST_F(APIServerTest, no_common_not_first_string) { // show procedure brpc::Controller show_cntl; // default is GET - show_cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/procedures/" + sp_name; + show_cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/procedures/" + sp_name; env->http_channel.CallMethod(NULL, &show_cntl, NULL, NULL, NULL); ASSERT_FALSE(show_cntl.Failed()) << show_cntl.ErrorText(); LOG(INFO) << "get sp resp: " << show_cntl.response_attachment(); @@ -974,7 +1099,7 @@ TEST_F(APIServerTest, no_common_not_first_string) { { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_POST); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/procedures/" + sp_name; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/procedures/" + sp_name; cntl.request_attachment().append(R"({ "input": [[11, "bb", 23, 123, 5.1, 6.1, 1590738994000, "2021-08-01"], [11, "bb", 23, 234, 5.2, 6.2, 1590738994000, "2021-08-02"]], @@ -1006,7 +1131,7 @@ TEST_F(APIServerTest, no_common_not_first_string) { TEST_F(APIServerTest, getDBs) { const auto env = APIServerTestEnv::Instance(); std::default_random_engine e; - std::string db_name = "" + std::to_string(e() % 100000); // to avoid use exists db, e.g. api_server_test + std::string db_name = "getdb" + std::to_string(e() % 100000); // to avoid use exists db, e.g. 
api_server_test LOG(INFO) << "test on db " << db_name; std::set test_dbs = {db_name, "monkey", "shark", "zebra"}; @@ -1014,7 +1139,7 @@ TEST_F(APIServerTest, getDBs) { std::set exists_db_set; { brpc::Controller show_cntl; // default is GET - show_cntl.http_request().uri() = "http://127.0.0.1:8010/dbs"; + show_cntl.http_request().uri() = env->api_server_url + "/dbs"; env->http_channel.CallMethod(NULL, &show_cntl, NULL, NULL, NULL); rapidjson::Document document; if (document.Parse(show_cntl.response_attachment().to_string().c_str()).HasParseError()) { @@ -1042,10 +1167,10 @@ TEST_F(APIServerTest, getDBs) { env->cluster_remote->DropDB(db, &status); ASSERT_TRUE(env->cluster_remote->CreateDB(db, &status)); } - env->queue_svc->Refresh(); + env->server_process->Refresh(); brpc::Controller show_cntl; - show_cntl.http_request().uri() = "http://127.0.0.1:8010/dbs"; + show_cntl.http_request().uri() = env->api_server_url + "/dbs"; env->http_channel.CallMethod(NULL, &show_cntl, NULL, NULL, NULL); ASSERT_FALSE(show_cntl.Failed()) << show_cntl.ErrorText(); rapidjson::Document document; @@ -1071,15 +1196,15 @@ TEST_F(APIServerTest, getDBs) { TEST_F(APIServerTest, getTables) { const auto env = APIServerTestEnv::Instance(); std::default_random_engine e; - std::string db_name = "" + std::to_string(e() % 100000); // to avoid use db which has tables + std::string db_name = "gettable" + std::to_string(e() % 100000); // to avoid use db which has tables LOG(INFO) << "test on db " << db_name; // setup { hybridse::sdk::Status status; env->cluster_remote->CreateDB(db_name, &status); - env->queue_svc->Refresh(); + env->server_process->Refresh(); brpc::Controller show_cntl; // default is GET - show_cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + db_name + "/tables"; + show_cntl.http_request().uri() = env->api_server_url + "/dbs/" + db_name + "/tables"; env->http_channel.CallMethod(NULL, &show_cntl, NULL, NULL, NULL); ASSERT_FALSE(show_cntl.Failed()) << show_cntl.ErrorText(); rapidjson::Document document; @@ -1110,7 +1235,7 @@ TEST_F(APIServerTest, getTables) { } { brpc::Controller show_cntl; // default is GET - show_cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + db_name + "/tables"; + show_cntl.http_request().uri() = env->api_server_url + "/dbs/" + db_name + "/tables"; env->http_channel.CallMethod(NULL, &show_cntl, NULL, NULL, NULL); ASSERT_FALSE(show_cntl.Failed()) << show_cntl.ErrorText(); rapidjson::Document document; @@ -1135,7 +1260,7 @@ TEST_F(APIServerTest, getTables) { } { brpc::Controller show_cntl; // default is GET - show_cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/db_not_exist/tables"; + show_cntl.http_request().uri() = env->api_server_url + "/dbs/db_not_exist/tables"; env->http_channel.CallMethod(NULL, &show_cntl, NULL, NULL, NULL); ASSERT_FALSE(show_cntl.Failed()) << show_cntl.ErrorText(); rapidjson::Document document; @@ -1148,7 +1273,7 @@ TEST_F(APIServerTest, getTables) { } for (auto table : tables) { brpc::Controller show_cntl; // default is GET - show_cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + db_name + "/tables/" + table; + show_cntl.http_request().uri() = env->api_server_url + "/dbs/" + db_name + "/tables/" + table; env->http_channel.CallMethod(NULL, &show_cntl, NULL, NULL, NULL); ASSERT_FALSE(show_cntl.Failed()) << show_cntl.ErrorText(); rapidjson::Document document; @@ -1164,7 +1289,7 @@ TEST_F(APIServerTest, getTables) { } { brpc::Controller show_cntl; // default is GET - show_cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + db_name + 
"/tables/not_exist"; + show_cntl.http_request().uri() = env->api_server_url + "/dbs/" + db_name + "/tables/not_exist"; env->http_channel.CallMethod(NULL, &show_cntl, NULL, NULL, NULL); ASSERT_FALSE(show_cntl.Failed()) << show_cntl.ErrorText(); rapidjson::Document document; @@ -1177,7 +1302,7 @@ TEST_F(APIServerTest, getTables) { } { brpc::Controller show_cntl; // default is GET - show_cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/db_not_exist/tables/apple"; + show_cntl.http_request().uri() = env->api_server_url + "/dbs/db_not_exist/tables/apple"; env->http_channel.CallMethod(NULL, &show_cntl, NULL, NULL, NULL); ASSERT_FALSE(show_cntl.Failed()) << show_cntl.ErrorText(); rapidjson::Document document; @@ -1231,7 +1356,7 @@ TEST_F(APIServerTest, jsonInput) { // show procedure brpc::Controller show_cntl; // default is GET - show_cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/procedures/" + sp_name; + show_cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/procedures/" + sp_name; env->http_channel.CallMethod(NULL, &show_cntl, NULL, NULL, NULL); ASSERT_FALSE(show_cntl.Failed()) << show_cntl.ErrorText(); LOG(INFO) << "get sp resp: " << show_cntl.response_attachment(); @@ -1241,7 +1366,7 @@ TEST_F(APIServerTest, jsonInput) { { brpc::Controller cntl; cntl.http_request().set_method(brpc::HTTP_METHOD_POST); - cntl.http_request().uri() = "http://127.0.0.1:8010/dbs/" + env->db + "/deployments/" + sp_name; + cntl.http_request().uri() = env->api_server_url + "/dbs/" + env->db + "/deployments/" + sp_name; cntl.request_attachment().append(R"({ "input": [{"c1":"bb", "c3":23, "c4":123, "c5":5.1, "c6":6.1, "c7":1590738994000, "c8":"2021-08-01"}, {"c1":"bb", "c3":23, "c4":234, "c5":5.2, "c6":6.2, "c7":1590738994000, "c8":"2021-08-02"}] diff --git a/src/base/status_util.h b/src/base/status_util.h index acd36e399f5..1d0db238d61 100644 --- a/src/base/status_util.h +++ b/src/base/status_util.h @@ -154,6 +154,13 @@ LOG(WARNING) << "Status: " << _s->ToString(); \ } while (0) +#define PREPEND_AND_WARN(s, msg) \ + do { \ + ::hybridse::sdk::Status* _s = (s); \ + _s->Prepend((msg)); \ + LOG(WARNING) << "Status: " << _s->ToString(); \ + } while (0) + /// @brief s.msg += append_str, and warn it #define CODE_APPEND_AND_WARN(s, code, msg) \ do { \ diff --git a/src/catalog/client_manager.cc b/src/catalog/client_manager.cc index d85de6c6c95..2930f17925e 100644 --- a/src/catalog/client_manager.cc +++ b/src/catalog/client_manager.cc @@ -324,6 +324,25 @@ std::shared_ptr<::hybridse::vm::TableHandler> TabletAccessor::SubQuery(uint32_t } return async_table_handler; } + +void TabletsAccessor::AddTabletAccessor(std::shared_ptr accessor) { + if (!accessor) { + LOG(WARNING) << "Fail to add null tablet accessor"; + return; + } + auto iter = name_idx_map_.find(accessor->GetName()); + if (iter == name_idx_map_.cend()) { + accessors_.push_back(accessor); + name_idx_map_.insert(std::make_pair(accessor->GetName(), accessors_.size() - 1)); + posinfos_.push_back(std::vector({rows_cnt_})); + assign_accessor_idxs_.push_back(accessors_.size() - 1); + } else { + posinfos_[iter->second].push_back(rows_cnt_); + assign_accessor_idxs_.push_back(iter->second); + } + rows_cnt_++; +} + std::shared_ptr TabletsAccessor::SubQuery(uint32_t task_id, const std::string& db, const std::string& sql, const hybridse::codec::Row& row, @@ -331,6 +350,7 @@ std::shared_ptr TabletsAccessor::SubQuery(uint32_t tas return std::make_shared<::hybridse::vm::ErrorRowHandler>(::hybridse::common::kRpcError, "TabletsAccessor 
Unsupport SubQuery with request"); } + std::shared_ptr TabletsAccessor::SubQuery(uint32_t task_id, const std::string& db, const std::string& sql, const std::set& common_column_indices, @@ -350,6 +370,7 @@ std::shared_ptr TabletsAccessor::SubQuery(uint32_t t } return tables_handler; } + PartitionClientManager::PartitionClientManager(uint32_t pid, const std::shared_ptr& leader, const std::vector>& followers) : pid_(pid), leader_(leader), followers_(followers), rand_(0xdeadbeef) {} @@ -406,6 +427,29 @@ TableClientManager::TableClientManager(const ::openmldb::storage::TableSt& table } } +void TableClientManager::Show() const { + DLOG(INFO) << "show client manager "; + for (size_t id = 0; id < partition_managers_.size(); id++) { + auto pmg = std::atomic_load_explicit(&partition_managers_[id], std::memory_order_relaxed); + if (pmg) { + if (pmg->GetLeader()) { + DLOG(INFO) << "partition managers (pid, leader) " << id << ", " << pmg->GetLeader()->GetName(); + } else { + DLOG(INFO) << "partition managers (pid, leader) " << id << ", null leader"; + } + } else { + DLOG(INFO) << "partition managers (pid, leader) " << id << ", null mamanger"; + } + } +} + +std::shared_ptr TableClientManager::GetPartitionClientManager(uint32_t pid) const { + if (pid < partition_managers_.size()) { + return std::atomic_load_explicit(&partition_managers_[pid], std::memory_order_relaxed); + } + return std::shared_ptr(); +} + bool TableClientManager::UpdatePartitionClientManager(const ::openmldb::storage::PartitionSt& partition, const ClientManager& client_manager) { uint32_t pid = partition.GetPid(); @@ -429,6 +473,41 @@ bool TableClientManager::UpdatePartitionClientManager(const ::openmldb::storage: return true; } +std::shared_ptr TableClientManager::GetTablet(uint32_t pid) const { + auto partition_manager = GetPartitionClientManager(pid); + if (partition_manager) { + return partition_manager->GetLeader(); + } + return std::shared_ptr(); +} + +std::vector> TableClientManager::GetTabletFollowers(uint32_t pid) const { + auto partition_manager = GetPartitionClientManager(pid); + if (partition_manager) { + return partition_manager->GetFollowers(); + } + return {}; +} + +std::shared_ptr TableClientManager::GetTablet(std::vector pids) const { + auto tablets_accessor = std::make_shared(); + for (size_t idx = 0; idx < pids.size(); idx++) { + auto partition_manager = GetPartitionClientManager(pids[idx]); + if (partition_manager) { + auto leader = partition_manager->GetLeader(); + if (!leader) { + LOG(WARNING) << "fail to get TabletsAccessor, null tablet for pid " << pids[idx]; + return std::shared_ptr(); + } + tablets_accessor->AddTabletAccessor(partition_manager->GetLeader()); + } else { + LOG(WARNING) << "fail to get tablet: pid " << pids[idx] << " not exist"; + return std::shared_ptr(); + } + } + return tablets_accessor; +} + std::shared_ptr ClientManager::GetTablet(const std::string& name) const { std::lock_guard<::openmldb::base::SpinMutex> lock(mu_); auto it = clients_.find(name); diff --git a/src/catalog/client_manager.h b/src/catalog/client_manager.h index 10a9e7d60bf..7e913e68ce5 100644 --- a/src/catalog/client_manager.h +++ b/src/catalog/client_manager.h @@ -56,6 +56,7 @@ class TabletRowHandler : public ::hybridse::vm::RowHandler { ::hybridse::codec::Row row_; openmldb::RpcCallback* callback_; }; + class AsyncTableHandler : public ::hybridse::vm::MemTableHandler { public: explicit AsyncTableHandler(openmldb::RpcCallback* callback, @@ -91,6 +92,7 @@ class AsyncTableHandler : public ::hybridse::vm::MemTableHandler { 
openmldb::RpcCallback* callback_; bool request_is_common_; }; + class AsyncTablesHandler : public ::hybridse::vm::MemTableHandler { public: AsyncTablesHandler(); @@ -169,28 +171,14 @@ class TabletAccessor : public ::hybridse::vm::Tablet { std::string name_; std::shared_ptr<::openmldb::client::TabletClient> tablet_client_; }; + class TabletsAccessor : public ::hybridse::vm::Tablet { public: TabletsAccessor() : name_("TabletsAccessor"), rows_cnt_(0) {} ~TabletsAccessor() {} const std::string& GetName() const { return name_; } - void AddTabletAccessor(std::shared_ptr accessor) { - if (!accessor) { - LOG(WARNING) << "Fail to add null tablet accessor"; - return; - } - auto iter = name_idx_map_.find(accessor->GetName()); - if (iter == name_idx_map_.cend()) { - accessors_.push_back(accessor); - name_idx_map_.insert(std::make_pair(accessor->GetName(), accessors_.size() - 1)); - posinfos_.push_back(std::vector({rows_cnt_})); - assign_accessor_idxs_.push_back(accessors_.size() - 1); - } else { - posinfos_[iter->second].push_back(rows_cnt_); - assign_accessor_idxs_.push_back(iter->second); - } - rows_cnt_++; - } + void AddTabletAccessor(std::shared_ptr accessor); + std::shared_ptr SubQuery(uint32_t task_id, const std::string& db, const std::string& sql, const hybridse::codec::Row& row, const bool is_procedure, const bool is_debug) override; @@ -209,6 +197,7 @@ class TabletsAccessor : public ::hybridse::vm::Tablet { std::vector> posinfos_; std::map name_idx_map_; }; + class PartitionClientManager { public: PartitionClientManager(uint32_t pid, const std::shared_ptr& leader, @@ -235,65 +224,18 @@ class TableClientManager { TableClientManager(const ::openmldb::storage::TableSt& table_st, const ClientManager& client_manager); - void Show() const { - DLOG(INFO) << "show client manager "; - for (size_t id = 0; id < partition_managers_.size(); id++) { - auto pmg = std::atomic_load_explicit(&partition_managers_[id], std::memory_order_relaxed); - if (pmg) { - if (pmg->GetLeader()) { - DLOG(INFO) << "partition managers (pid, leader) " << id << ", " << pmg->GetLeader()->GetName(); - } else { - DLOG(INFO) << "partition managers (pid, leader) " << id << ", null leader"; - } - } else { - DLOG(INFO) << "partition managers (pid, leader) " << id << ", null mamanger"; - } - } - } - std::shared_ptr GetPartitionClientManager(uint32_t pid) const { - if (pid < partition_managers_.size()) { - return std::atomic_load_explicit(&partition_managers_[pid], std::memory_order_relaxed); - } - return std::shared_ptr(); - } + void Show() const; + + std::shared_ptr GetPartitionClientManager(uint32_t pid) const; bool UpdatePartitionClientManager(const ::openmldb::storage::PartitionSt& partition, const ClientManager& client_manager); - std::shared_ptr GetTablet(uint32_t pid) const { - auto partition_manager = GetPartitionClientManager(pid); - if (partition_manager) { - return partition_manager->GetLeader(); - } - return std::shared_ptr(); - } + std::shared_ptr GetTablet(uint32_t pid) const; - std::vector> GetTabletFollowers(uint32_t pid) const { - auto partition_manager = GetPartitionClientManager(pid); - if (partition_manager) { - return partition_manager->GetFollowers(); - } - return {}; - } + std::vector> GetTabletFollowers(uint32_t pid) const; - std::shared_ptr GetTablet(std::vector pids) const { - std::shared_ptr tablets_accessor = std::shared_ptr(new TabletsAccessor()); - for (size_t idx = 0; idx < pids.size(); idx++) { - auto partition_manager = GetPartitionClientManager(pids[idx]); - if (partition_manager) { - auto leader = 
partition_manager->GetLeader(); - if (!leader) { - LOG(WARNING) << "fail to get TabletsAccessor, null tablet for pid " << pids[idx]; - return std::shared_ptr(); - } - tablets_accessor->AddTabletAccessor(partition_manager->GetLeader()); - } else { - LOG(WARNING) << "fail to get tablet: pid " << pids[idx] << " not exist"; - return std::shared_ptr(); - } - } - return tablets_accessor; - } + std::shared_ptr GetTablet(std::vector pids) const; private: std::vector> partition_managers_; diff --git a/src/catalog/distribute_iterator.cc b/src/catalog/distribute_iterator.cc index b82afbb81fd..032d3ec75f2 100644 --- a/src/catalog/distribute_iterator.cc +++ b/src/catalog/distribute_iterator.cc @@ -423,7 +423,7 @@ const ::hybridse::codec::Row& RemoteWindowIterator::GetValue() { memcpy(copyed_row_data, slice_row.data(), sz); auto shared_slice = ::hybridse::base::RefCountedSlice::CreateManaged(copyed_row_data, sz); row_.Reset(shared_slice); - LOG(INFO) << "get value pk " << pk_ << " ts_key " << kv_it_->GetKey() << " ts " << ts_; + DLOG(INFO) << "get value pk " << pk_ << " ts_key " << kv_it_->GetKey() << " ts " << ts_; valid_value_ = true; return row_; } diff --git a/src/catalog/tablet_catalog.h b/src/catalog/tablet_catalog.h index 7d834147591..0a054a869db 100644 --- a/src/catalog/tablet_catalog.h +++ b/src/catalog/tablet_catalog.h @@ -175,6 +175,7 @@ class TabletTableHandler : public ::hybridse::vm::TableHandler, const std::vector &pks) override; inline uint32_t GetTid() { return table_st_.GetTid(); } + inline uint32_t GetPartitionNum() { return partition_num_; } void AddTable(std::shared_ptr<::openmldb::storage::Table> table); @@ -185,6 +186,8 @@ class TabletTableHandler : public ::hybridse::vm::TableHandler, bool Update(const ::openmldb::nameserver::TableInfo &meta, const ClientManager &client_manager, bool* index_updated); + std::shared_ptr GetTableClientManager() { return table_client_manager_; } + private: inline int32_t GetColumnIndex(const std::string &column) { auto it = types_.find(column); diff --git a/src/client/tablet_client.cc b/src/client/tablet_client.cc index 54b2a8c9cec..f6e8406add2 100644 --- a/src/client/tablet_client.cc +++ b/src/client/tablet_client.cc @@ -203,19 +203,21 @@ bool TabletClient::UpdateTableMetaForAddField(uint32_t tid, const std::vector>& dimensions, - int memory_usage_limit) { + int memory_usage_limit, bool put_if_absent) { + ::google::protobuf::RepeatedPtrField<::openmldb::api::Dimension> pb_dimensions; for (size_t i = 0; i < dimensions.size(); i++) { ::openmldb::api::Dimension* d = pb_dimensions.Add(); d->set_key(dimensions[i].first); d->set_idx(dimensions[i].second); } - return Put(tid, pid, time, base::Slice(value), &pb_dimensions, memory_usage_limit); + + return Put(tid, pid, time, base::Slice(value), &pb_dimensions, memory_usage_limit, put_if_absent); } base::Status TabletClient::Put(uint32_t tid, uint32_t pid, uint64_t time, const base::Slice& value, ::google::protobuf::RepeatedPtrField<::openmldb::api::Dimension>* dimensions, - int memory_usage_limit) { + int memory_usage_limit, bool put_if_absent) { ::openmldb::api::PutRequest request; if (memory_usage_limit < 0 || memory_usage_limit > 100) { return {base::ReturnCode::kError, absl::StrCat("invalid memory_usage_limit ", memory_usage_limit)}; @@ -227,6 +229,7 @@ base::Status TabletClient::Put(uint32_t tid, uint32_t pid, uint64_t time, const request.set_tid(tid); request.set_pid(pid); request.mutable_dimensions()->Swap(dimensions); + request.set_put_if_absent(put_if_absent); ::openmldb::api::PutResponse response; 
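    // Editor's note (assumption, not stated in this patch): put_if_absent is carried to the
    // tablet inside PutRequest; the name suggests the server skips the write when an entry
    // for the same key already exists, so callers keep the default false for ordinary puts.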
auto st = client_.SendRequestSt(&::openmldb::api::TabletServer_Stub::Put, &request, &response, FLAGS_request_timeout_ms, 1); @@ -812,28 +815,28 @@ bool TabletClient::Delete(uint32_t tid, uint32_t pid, const std::string& pk, con return true; } -base::Status TabletClient::Delete(uint32_t tid, uint32_t pid, const std::map& index_val, - const std::string& ts_name, const std::optional start_ts, const std::optional& end_ts) { +base::Status TabletClient::Delete(uint32_t tid, uint32_t pid, const sdk::DeleteOption& option, uint64_t timeout_ms) { ::openmldb::api::DeleteRequest request; ::openmldb::api::GeneralResponse response; request.set_tid(tid); request.set_pid(pid); - for (const auto& kv : index_val) { + if (option.idx.has_value()) { auto dimension = request.add_dimensions(); - dimension->set_idx(kv.first); - dimension->set_key(kv.second); + dimension->set_idx(option.idx.value()); + dimension->set_key(option.key); } - if (start_ts.has_value()) { - request.set_ts(start_ts.value()); + if (option.start_ts.has_value()) { + request.set_ts(option.start_ts.value()); } - if (end_ts.has_value()) { - request.set_end_ts(end_ts.value()); + if (option.end_ts.has_value()) { + request.set_end_ts(option.end_ts.value()); } - if (!ts_name.empty()) { - request.set_ts_name(ts_name); + if (!option.ts_name.empty()) { + request.set_ts_name(option.ts_name); } + request.set_enable_decode_value(option.enable_decode_value); bool ok = client_.SendRequest(&::openmldb::api::TabletServer_Stub::Delete, &request, &response, - FLAGS_request_timeout_ms, 1); + timeout_ms, 1); if (!ok || response.code() != 0) { return {base::ReturnCode::kError, response.msg()}; } diff --git a/src/client/tablet_client.h b/src/client/tablet_client.h index b4866b77618..632f75f3510 100644 --- a/src/client/tablet_client.h +++ b/src/client/tablet_client.h @@ -31,6 +31,7 @@ #include "codec/schema_codec.h" #include "proto/tablet.pb.h" #include "rpc/rpc_client.h" +#include "sdk/option.h" namespace openmldb { @@ -76,27 +77,23 @@ class TabletClient : public Client { base::Status Put(uint32_t tid, uint32_t pid, uint64_t time, const std::string& value, const std::vector>& dimensions, - int memory_usage_limit = 0); + int memory_usage_limit = 0, bool put_if_absent = false); base::Status Put(uint32_t tid, uint32_t pid, uint64_t time, const base::Slice& value, ::google::protobuf::RepeatedPtrField<::openmldb::api::Dimension>* dimensions, - int memory_usage_limit = 0); + int memory_usage_limit = 0, bool put_if_absent = false); bool Get(uint32_t tid, uint32_t pid, const std::string& pk, uint64_t time, std::string& value, // NOLINT uint64_t& ts, // NOLINT std::string& msg); // NOLINT bool Get(uint32_t tid, uint32_t pid, const std::string& pk, uint64_t time, const std::string& idx_name, - std::string& value, // NOLINT - uint64_t& ts, // NOLINT - std::string& msg); // NOLINT + std::string& value, uint64_t& ts, std::string& msg); // NOLINT bool Delete(uint32_t tid, uint32_t pid, const std::string& pk, const std::string& idx_name, std::string& msg); // NOLINT - base::Status Delete(uint32_t tid, uint32_t pid, const std::map& index_val, - const std::string& ts_name, const std::optional start_ts, - const std::optional& end_ts); + base::Status Delete(uint32_t tid, uint32_t pid, const sdk::DeleteOption& option, uint64_t timeout_ms); bool Count(uint32_t tid, uint32_t pid, const std::string& pk, const std::string& idx_name, bool filter_expired_data, uint64_t& value, std::string& msg); // NOLINT diff --git a/src/cmd/display.h b/src/cmd/display.h index 714a9ca6a73..34e1f851e39 
100644 --- a/src/cmd/display.h +++ b/src/cmd/display.h @@ -586,6 +586,21 @@ __attribute__((unused)) static void PrintProcedureInfo( sql = boost::regex_replace(sql, boost::regex(pattern_sp), "DEPLOY"); std::string pattern_blank = "(.*)(\\(.*\\) )(BEGIN )(.*)( END;)"; sql = boost::regex_replace(sql, boost::regex(pattern_blank), "$1$4"); + if (!sp_info.GetOption()->empty()) { + std::stringstream ss; + ss << " OPTIONS("; + for (auto iter = sp_info.GetOption()->begin(); iter != sp_info.GetOption()->end(); iter++) { + if (iter != sp_info.GetOption()->begin()) { + ss << ", "; + } + ss << absl::AsciiStrToUpper(iter->first) << "=\"" << iter->second << "\""; + } + ss << ")"; + std::string prefix = absl::StrCat("DEPLOY ", sp_info.GetSpName()); + absl::string_view old_sql = sql; + old_sql.remove_prefix(prefix.size()); + sql = absl::StrCat(prefix, ss.str(), old_sql); + } } PrintItemTable({"DB", type_name}, {vec}, stream); diff --git a/src/cmd/openmldb.cc b/src/cmd/openmldb.cc index 8d0d9b692f5..fbad945b336 100644 --- a/src/cmd/openmldb.cc +++ b/src/cmd/openmldb.cc @@ -81,6 +81,8 @@ DECLARE_uint32(max_col_display_length); DECLARE_bool(version); DECLARE_bool(use_name); DECLARE_string(data_dir); +DECLARE_string(user); +DECLARE_string(password); const std::string OPENMLDB_VERSION = std::to_string(OPENMLDB_VERSION_MAJOR) + "." + // NOLINT std::to_string(OPENMLDB_VERSION_MINOR) + "." + @@ -438,7 +440,7 @@ std::shared_ptr<::openmldb::client::TabletClient> GetTabletClient(const ::openml void HandleNSClientSetTTL(const std::vector& parts, ::openmldb::client::NsClient* client) { if (parts.size() < 4) { - std::cout << "bad setttl format, eg settl t1 absolute 10" << std::endl; + std::cout << "bad setttl format, eg settl t1 absolute 10 [index0]" << std::endl; return; } std::string index_name; @@ -1307,14 +1309,14 @@ void HandleNSGet(const std::vector& parts, ::openmldb::client::NsCl if (parts.size() < 4) { std::cout << "get format error. eg: get table_name key ts | get " "table_name key idx_name ts | get table_name=xxx key=xxx " - "index_name=xxx ts=xxx ts_name=xxx " + "index_name=xxx ts=xxx" << std::endl; return; } std::map parameter_map; if (!GetParameterMap("table_name", parts, "=", parameter_map)) { std::cout << "get format error. 
eg: get table_name=xxx key=xxx " - "index_name=xxx ts=xxx ts_name=xxx " + "index_name=xxx ts=xxx" << std::endl; return; } @@ -1382,7 +1384,7 @@ void HandleNSGet(const std::vector& parts, ::openmldb::client::NsCl return; } ::openmldb::codec::SDKCodec codec(tables[0]); - bool no_schema = tables[0].column_desc_size() == 0 && tables[0].column_desc_size() == 0; + bool no_schema = tables[0].column_desc_size() == 0; if (no_schema) { std::string value; uint64_t ts = 0; @@ -2459,7 +2461,7 @@ void HandleNSClientHelp(const std::vector& parts, ::openmldb::clien printf("ex:man create\n"); } else if (parts[1] == "setttl") { printf("desc: set table ttl \n"); - printf("usage: setttl table_name ttl_type ttl [ts_name]\n"); + printf("usage: setttl table_name ttl_type ttl [index_name], abs ttl unit is minute\n"); printf("ex: setttl t1 absolute 10\n"); printf("ex: setttl t2 latest 5\n"); printf("ex: setttl t3 latest 5 ts1\n"); @@ -3867,18 +3869,25 @@ void StartAPIServer() { PDLOG(WARNING, "Invalid nameserver format"); exit(1); } - auto sdk = new ::openmldb::sdk::StandAloneSDK(vec[0], port); + auto standalone_options = std::make_shared<::openmldb::sdk::StandaloneOptions>(); + standalone_options->host = vec[0]; + standalone_options->port = port; + standalone_options->user = FLAGS_user; + standalone_options->password = FLAGS_password; + auto sdk = new ::openmldb::sdk::StandAloneSDK(standalone_options); if (!sdk->Init() || !api_service->Init(sdk)) { PDLOG(WARNING, "Fail to init"); exit(1); } } else { - ::openmldb::sdk::ClusterOptions cluster_options; - cluster_options.zk_cluster = FLAGS_zk_cluster; - cluster_options.zk_path = FLAGS_zk_root_path; - cluster_options.zk_session_timeout = FLAGS_zk_session_timeout; - cluster_options.zk_auth_schema = FLAGS_zk_auth_schema; - cluster_options.zk_cert = FLAGS_zk_cert; + auto cluster_options = std::make_shared<::openmldb::sdk::SQLRouterOptions>(); + cluster_options->zk_cluster = FLAGS_zk_cluster; + cluster_options->zk_path = FLAGS_zk_root_path; + cluster_options->zk_session_timeout = FLAGS_zk_session_timeout; + cluster_options->zk_auth_schema = FLAGS_zk_auth_schema; + cluster_options->zk_cert = FLAGS_zk_cert; + cluster_options->user = FLAGS_user; + cluster_options->password = FLAGS_password; if (!api_service->Init(cluster_options)) { PDLOG(WARNING, "Fail to init"); exit(1); diff --git a/src/cmd/single_tablet_test.cc b/src/cmd/single_tablet_test.cc index 2c564b30546..bfe3ccedba4 100644 --- a/src/cmd/single_tablet_test.cc +++ b/src/cmd/single_tablet_test.cc @@ -65,9 +65,9 @@ TEST_P(DBSDKTest, CreateFunction) { sr = cli->sr; ::openmldb::sdk::SQLClusterRouter* sr_2 = nullptr; if (cs->IsClusterMode()) { - ::openmldb::sdk::ClusterOptions copt; - copt.zk_cluster = mc.GetZkCluster(); - copt.zk_path = mc.GetZkPath(); + auto copt = std::make_shared(); + copt->zk_cluster = mc.GetZkCluster(); + copt->zk_path = mc.GetZkPath(); auto cur_cs = new ::openmldb::sdk::ClusterSDK(copt); cur_cs->Init(); sr_2 = new ::openmldb::sdk::SQLClusterRouter(cur_cs); @@ -144,9 +144,9 @@ TEST_P(DBSDKTest, CreateUdafFunction) { sr = cli->sr; std::unique_ptr<::openmldb::sdk::SQLClusterRouter> sr_2; if (cs->IsClusterMode()) { - ::openmldb::sdk::ClusterOptions copt; - copt.zk_cluster = mc.GetZkCluster(); - copt.zk_path = mc.GetZkPath(); + auto copt = std::make_shared(); + copt->zk_cluster = mc.GetZkCluster(); + copt->zk_path = mc.GetZkPath(); auto cur_cs = new ::openmldb::sdk::ClusterSDK(copt); cur_cs->Init(); sr_2 = std::make_unique<::openmldb::sdk::SQLClusterRouter>(cur_cs); @@ -232,16 +232,17 @@ int main(int 
argc, char** argv) { mc.SetUp(1); sleep(5); srand(time(NULL)); - ::openmldb::sdk::ClusterOptions copt; - copt.zk_cluster = mc.GetZkCluster(); - copt.zk_path = mc.GetZkPath(); + auto copt = std::make_shared<::openmldb::sdk::SQLRouterOptions>(); + copt->zk_cluster = mc.GetZkCluster(); + copt->zk_path = mc.GetZkPath(); ::openmldb::cmd::cluster_cli.cs = new ::openmldb::sdk::ClusterSDK(copt); ::openmldb::cmd::cluster_cli.cs->Init(); ::openmldb::cmd::cluster_cli.sr = new ::openmldb::sdk::SQLClusterRouter(::openmldb::cmd::cluster_cli.cs); ::openmldb::cmd::cluster_cli.sr->Init(); env.SetUp(); - ::openmldb::cmd::standalone_cli.cs = new ::openmldb::sdk::StandAloneSDK("127.0.0.1", env.GetNsPort()); + auto sopt = std::make_shared<::openmldb::sdk::StandaloneOptions>("127.0.0.1", env.GetNsPort()); + ::openmldb::cmd::standalone_cli.cs = new ::openmldb::sdk::StandAloneSDK(sopt); ::openmldb::cmd::standalone_cli.cs->Init(); ::openmldb::cmd::standalone_cli.sr = new ::openmldb::sdk::SQLClusterRouter(::openmldb::cmd::standalone_cli.cs); ::openmldb::cmd::standalone_cli.sr->Init(); diff --git a/src/cmd/sql_cmd.h b/src/cmd/sql_cmd.h index 6b8eae72afb..56f6df6b5a4 100644 --- a/src/cmd/sql_cmd.h +++ b/src/cmd/sql_cmd.h @@ -16,6 +16,10 @@ #ifndef SRC_CMD_SQL_CMD_H_ #define SRC_CMD_SQL_CMD_H_ + +#include +#include + #include #include #include @@ -46,6 +50,8 @@ DECLARE_string(zk_cert); DECLARE_int32(zk_session_timeout); DECLARE_uint32(zk_log_level); DECLARE_string(zk_log_file); +DECLARE_string(user); +DECLARE_string(password); // stand-alone mode DECLARE_string(host); @@ -143,6 +149,30 @@ std::string ExecFetch(const std::string& sql) { return ss.str(); } +base::Status GetPassword(std::string* password) { + // refer https://www.gnu.org/software/libc/manual/html_mono/libc.html#getpass + struct termios old_attr, new_attr; + if (tcgetattr(fileno(stdin), &old_attr) != 0) { + return {base::ReturnCode::kError, "tcgetattr execute failed!"}; + } + new_attr = old_attr; + new_attr.c_lflag &= ~ECHO; + if (tcsetattr(fileno(stdin), TCSAFLUSH, &new_attr) != 0) { + return {base::ReturnCode::kError, "tcsetattr execute failed!"}; + } + size_t len = 0; + char* lineptr = nullptr; + if (ssize_t nread = getline(&lineptr, &len, stdin); nread == -1) { + free(lineptr); + return {base::ReturnCode::kError, "read input failed!"}; + } else if (nread > 1) { + password->assign(lineptr, nread - 1); + } + free(lineptr); + (void) tcsetattr(fileno(stdin), TCSAFLUSH, &old_attr); + return {}; +} + void HandleSQL(const std::string& sql) { std::cout << ExecFetch(sql); } std::string SafeGetString(std::shared_ptr rs, int idx) { @@ -263,16 +293,28 @@ void Shell() { } bool InitClusterSDK() { + auto options = std::make_shared(); ::openmldb::sdk::ClusterOptions copt; - copt.zk_cluster = FLAGS_zk_cluster; - copt.zk_path = FLAGS_zk_root_path; - copt.zk_session_timeout = FLAGS_zk_session_timeout; - copt.zk_log_level = FLAGS_zk_log_level; - copt.zk_log_file = FLAGS_zk_log_file; - copt.zk_auth_schema = FLAGS_zk_auth_schema; - copt.zk_cert = FLAGS_zk_cert; - - cs = new ::openmldb::sdk::ClusterSDK(copt); + options->zk_cluster = FLAGS_zk_cluster; + options->zk_path = FLAGS_zk_root_path; + options->zk_session_timeout = FLAGS_zk_session_timeout; + options->zk_log_level = FLAGS_zk_log_level; + options->zk_log_file = FLAGS_zk_log_file; + options->zk_auth_schema = FLAGS_zk_auth_schema; + options->zk_cert = FLAGS_zk_cert; + options->spark_conf_path = FLAGS_spark_conf; + options->request_timeout = FLAGS_request_timeout; + options->user = FLAGS_user; + options->password = 
FLAGS_password; + if (!::google::GetCommandLineFlagInfoOrDie("user").is_default && + ::google::GetCommandLineFlagInfoOrDie("password").is_default) { + std::cout << "Please enter password:" << std::endl; + if (auto status = GetPassword(&options->password); !status.OK()) { + std::cout << status.GetMsg() << std::endl; + return false; + } + } + cs = new ::openmldb::sdk::ClusterSDK(options); if (!cs->Init()) { std::cout << "ERROR: Failed to connect to db" << std::endl; return false; @@ -283,11 +325,6 @@ bool InitClusterSDK() { return false; } sr->SetInteractive(FLAGS_interactive); - - auto ops = std::dynamic_pointer_cast(sr->GetRouterOptions()); - ops->spark_conf_path = FLAGS_spark_conf; - ops->request_timeout = FLAGS_request_timeout; - return true; } @@ -306,7 +343,19 @@ bool InitStandAloneSDK() { std::cout << "ERROR: Host or port is missing" << std::endl; return false; } - cs = new ::openmldb::sdk::StandAloneSDK(FLAGS_host, FLAGS_port); + auto options = std::make_shared(FLAGS_host, FLAGS_port); + options->user = FLAGS_user; + options->password = FLAGS_password; + if (!::google::GetCommandLineFlagInfoOrDie("user").is_default && + ::google::GetCommandLineFlagInfoOrDie("password").is_default) { + std::cout << "Please enter password:" << std::endl; + if (auto status = GetPassword(&options->password); !status.OK()) { + std::cout << status.GetMsg() << std::endl; + return false; + } + } + options->request_timeout = FLAGS_request_timeout; + cs = new ::openmldb::sdk::StandAloneSDK(options); bool ok = cs->Init(); if (!ok) { std::cout << "ERROR: Failed to connect to db" << std::endl; @@ -318,8 +367,6 @@ bool InitStandAloneSDK() { return false; } sr->SetInteractive(FLAGS_interactive); - auto ops = sr->GetRouterOptions(); - ops->request_timeout = FLAGS_request_timeout; return true; } diff --git a/src/cmd/sql_cmd_test.cc b/src/cmd/sql_cmd_test.cc index cdff3943254..b575053324d 100644 --- a/src/cmd/sql_cmd_test.cc +++ b/src/cmd/sql_cmd_test.cc @@ -236,6 +236,54 @@ TEST_F(SqlCmdTest, SelectIntoOutfile) { remove(file_path.c_str()); } +TEST_P(DBSDKTest, TestUser) { + auto cli = GetParam(); + cs = cli->cs; + sr = cli->sr; + hybridse::sdk::Status status; + sr->ExecuteSQL(absl::StrCat("CREATE USER user1 OPTIONS(password='123456')"), &status); + ASSERT_TRUE(status.IsOK()); + sr->ExecuteSQL(absl::StrCat("CREATE USER user1 OPTIONS(password='123456')"), &status); + ASSERT_FALSE(status.IsOK()); + sr->ExecuteSQL(absl::StrCat("CREATE USER IF NOT EXISTS user1"), &status); + ASSERT_TRUE(status.IsOK()); + ASSERT_TRUE(true); + auto opt = sr->GetRouterOptions(); + if (cs->IsClusterMode()) { + auto real_opt = std::dynamic_pointer_cast(opt); + sdk::SQLRouterOptions opt1; + opt1.zk_cluster = real_opt->zk_cluster; + opt1.zk_path = real_opt->zk_path; + opt1.user = "user1"; + opt1.password = "123456"; + auto router = NewClusterSQLRouter(opt1); + ASSERT_TRUE(router != nullptr); + sr->ExecuteSQL(absl::StrCat("ALTER USER user1 SET OPTIONS(password='abc')"), &status); + ASSERT_TRUE(status.IsOK()); + router = NewClusterSQLRouter(opt1); + ASSERT_FALSE(router != nullptr); + } else { + auto real_opt = std::dynamic_pointer_cast(opt); + sdk::StandaloneOptions opt1; + opt1.host = real_opt->host; + opt1.port = real_opt->port; + opt1.user = "user1"; + opt1.password = "123456"; + auto router = NewStandaloneSQLRouter(opt1); + ASSERT_TRUE(router != nullptr); + sr->ExecuteSQL(absl::StrCat("ALTER USER user1 SET OPTIONS(password='abc')"), &status); + ASSERT_TRUE(status.IsOK()); + router = NewStandaloneSQLRouter(opt1); + ASSERT_FALSE(router != 
nullptr); + } + sr->ExecuteSQL(absl::StrCat("DROP USER user1"), &status); + ASSERT_TRUE(status.IsOK()); + sr->ExecuteSQL(absl::StrCat("DROP USER user1"), &status); + ASSERT_FALSE(status.IsOK()); + sr->ExecuteSQL(absl::StrCat("DROP USER IF EXISTS user1"), &status); + ASSERT_TRUE(status.IsOK()); +} + TEST_P(DBSDKTest, CreateDatabase) { auto cli = GetParam(); cs = cli->cs; @@ -532,7 +580,7 @@ TEST_F(SqlCmdTest, InsertWithDB) { sr, {"create database test1;", "create database test2;", "use test1;", "create table trans (c1 string, c2 int);", "use test2;", "insert into test1.trans values ('aaa', 123);"}); - auto cur_cs = new ::openmldb::sdk::StandAloneSDK(FLAGS_host, FLAGS_port); + auto cur_cs = new ::openmldb::sdk::StandAloneSDK(std::make_shared(FLAGS_host, FLAGS_port)); cur_cs->Init(); auto cur_sr = std::make_unique<::openmldb::sdk::SQLClusterRouter>(cur_cs); cur_sr->Init(); @@ -1190,7 +1238,8 @@ TEST_P(DBSDKTest, DeletetSameColIndex) { auto res = sr->ExecuteSQL(absl::StrCat("select * from ", table_name, ";"), &status); ASSERT_EQ(res->Size(), 100); - ProcessSQLs(sr, {absl::StrCat("delete from ", table_name, " where c1 = 'key2';")}); + sr->ExecuteSQL(absl::StrCat("delete from ", table_name, " where c1 = 'key2';"), &status); + ASSERT_TRUE(status.IsOK()) << status.msg; res = sr->ExecuteSQL(absl::StrCat("select * from ", table_name, ";"), &status); ASSERT_EQ(res->Size(), 90); ProcessSQLs(sr, { @@ -1199,6 +1248,125 @@ TEST_P(DBSDKTest, DeletetSameColIndex) { }); } +TEST_P(DBSDKTest, TestDelete) { + auto cli = GetParam(); + sr = cli->sr; + std::string name = "test" + GenRand(); + ::hybridse::sdk::Status status; + std::string ddl; + + std::string db = "db" + GenRand(); + ASSERT_TRUE(sr->CreateDB(db, &status)); + sr->ExecuteSQL(db, "set @@execute_mode = 'online';", &status); + sr->ExecuteSQL(db, "use " + db + " ;", &status); + ddl = absl::StrCat("create table ", name, + "(col1 string, col2 string, col3 string, col4 bigint, col5 bigint, col6 bigint, col7 string," + "index(key=col1, ts=col4), index(key=(col1, col2), ts=col4), index(key=col3, ts=col5));"); + ASSERT_TRUE(sr->ExecuteDDL(db, ddl, &status)) << "ddl: " << ddl; + ASSERT_TRUE(sr->RefreshCatalog()); + for (int i = 0; i < 10; i++) { + std::string key1 = absl::StrCat("key1_", i); + std::string key2 = absl::StrCat("key2_", i); + std::string key3 = absl::StrCat("key3_", i); + for (int j = 0; j < 10; j++) { + sr->ExecuteSQL(absl::StrCat("insert into ", name, + " values ('", key1, "', '", key2, "', '", key3, "', ", 100 + j, ",", 1000 + j, ", 1, 'v');"), + &status); + } + } + auto rs = sr->ExecuteSQL(db, "select * from " + name + ";", &status); + ASSERT_EQ(rs->Size(), 100); + rs = sr->ExecuteSQL(db, "delete from " + name + " where col1 = 'xxx' and col5 > 100;", &status); + ASSERT_FALSE(status.IsOK()); + rs = sr->ExecuteSQL(db, "delete from " + name + " where col1 = 'xxx' and col6 > 100;", &status); + ASSERT_FALSE(status.IsOK()); + rs = sr->ExecuteSQL(db, "delete from " + name + " where col1 = 'xxx' and col3 = 'aaa';", &status); + ASSERT_FALSE(status.IsOK()); + rs = sr->ExecuteSQL(db, "delete from " + name + " where col7 = 'xxx' and col3 = 'aaa';", &status); + ASSERT_FALSE(status.IsOK()); + sr->ExecuteSQL(db, "delete from " + name + " where col6 > 100;", &status); + ASSERT_FALSE(status.IsOK()); + rs = sr->ExecuteSQL(db, "delete from " + name + " where col1 = 'key1_1';", &status); + ASSERT_TRUE(status.IsOK()); + rs = sr->ExecuteSQL(db, "select * from " + name + " where col1 = 'key1_1';", &status); + ASSERT_EQ(rs->Size(), 0); + rs = sr->ExecuteSQL(db, 
"select * from " + name + " where col1 = 'key1_1' and col2 = 'key2_1';", &status); + ASSERT_EQ(rs->Size(), 0); + rs = sr->ExecuteSQL(db, "select * from " + name + " where col3 = 'key3_1';", &status); + ASSERT_EQ(rs->Size(), 0); + sr->ExecuteSQL(db, "delete from " + name + " where col4 > 105;", &status); + ASSERT_TRUE(status.IsOK()); + rs = sr->ExecuteSQL(db, "select * from " + name + " where col1 = 'key1_2';", &status); + ASSERT_EQ(rs->Size(), 6); + rs = sr->ExecuteSQL(db, "select * from " + name + " where col1 = 'key1_2' and col2 = 'key2_2';", &status); + ASSERT_EQ(rs->Size(), 6); + rs = sr->ExecuteSQL(db, "select * from " + name + " where col3 = 'key3_2';", &status); + ASSERT_EQ(rs->Size(), 6); + + ASSERT_TRUE(sr->ExecuteDDL(db, "drop table " + name + ";", &status)); + ASSERT_TRUE(sr->DropDB(db, &status)); +} + + +TEST_P(DBSDKTest, DeletetMulIndex) { + auto cli = GetParam(); + sr = cli->sr; + std::string db_name = "test2"; + std::string table_name = "test1"; + std::string ddl = + "create table test1 (c1 string, c2 string, c3 bigint, c4 bigint, " + "INDEX(KEY=c1, ts=c3), INDEX(KEY=c2, ts=c4));"; + ProcessSQLs(sr, { + "set @@execute_mode = 'online'", + absl::StrCat("create database ", db_name, ";"), + absl::StrCat("use ", db_name, ";"), + ddl, + }); + hybridse::sdk::Status status; + for (int i = 0; i < 10; i++) { + std::string key1 = absl::StrCat("key1_", i); + std::string key2 = absl::StrCat("key2_", i); + for (int j = 0; j < 10; j++) { + uint64_t ts = 1000 + j; + sr->ExecuteSQL(absl::StrCat("insert into ", table_name, + " values ('", key1, "', '", key2, "', ", ts, ",", ts, ");"), + &status); + } + } + + auto res = sr->ExecuteSQL(absl::StrCat("select * from ", table_name, ";"), &status); + ASSERT_EQ(res->Size(), 100); + res = sr->ExecuteSQL(absl::StrCat("select * from ", table_name, " where c1 = \'key1_2\';"), &status); + ASSERT_EQ(res->Size(), 10); + res = sr->ExecuteSQL(absl::StrCat("select * from ", table_name, " where c2 = \'key2_2\';"), &status); + ASSERT_EQ(res->Size(), 10); + sr->ExecuteSQL(absl::StrCat("delete from ", table_name, " where c1 = 'key1_2' and c3 = 1001;"), &status); + res = sr->ExecuteSQL(absl::StrCat("select * from ", table_name, " where c1 = \'key1_2\';"), &status); + ASSERT_EQ(res->Size(), 9); + res = sr->ExecuteSQL(absl::StrCat("select * from ", table_name, " where c2 = \'key2_2\';"), &status); + ASSERT_EQ(res->Size(), 9); + sr->ExecuteSQL(absl::StrCat("delete from ", table_name, " where c1 = 'key1_2';"), &status); + ASSERT_TRUE(status.IsOK()) << status.msg; + res = sr->ExecuteSQL(absl::StrCat("select * from ", table_name, ";"), &status); + ASSERT_EQ(res->Size(), 90); + res = sr->ExecuteSQL(absl::StrCat("select * from ", table_name, " where c1 = \'key1_2\';"), &status); + ASSERT_EQ(res->Size(), 0); + res = sr->ExecuteSQL(absl::StrCat("select * from ", table_name, " where c2 = \'key2_2\';"), &status); + ASSERT_EQ(res->Size(), 0); + sr->ExecuteSQL(absl::StrCat("delete from ", table_name, " where c3 >= 1005 ;"), &status); + ASSERT_TRUE(status.IsOK()) << status.msg; + res = sr->ExecuteSQL(absl::StrCat("select * from ", table_name, ";"), &status); + ASSERT_EQ(res->Size(), 45); + res = sr->ExecuteSQL(absl::StrCat("select * from ", table_name, " where c1 = \'key1_3\';"), &status); + ASSERT_EQ(res->Size(), 5); + res = sr->ExecuteSQL(absl::StrCat("select * from ", table_name, " where c2 = \'key2_3\';"), &status); + ASSERT_EQ(res->Size(), 5); + ProcessSQLs(sr, { + absl::StrCat("drop table ", table_name), + absl::StrCat("drop database ", db_name), + }); +} + 
TEST_P(DBSDKTest, SQLDeletetRow) { auto cli = GetParam(); sr = cli->sr; @@ -3274,6 +3442,7 @@ TEST_P(DBSDKTest, ShowComponents) { void ExpectShowTableStatusResult(const std::vector>& expect, hybridse::sdk::ResultSet* rs, bool all_db = false, bool is_cluster = false) { static const std::vector> SystemClusterTableStatus = { + {{}, "USER", "__INTERNAL_DB", "memory", {}, {}, {}, "1", "0", "1", "NULL", "NULL", "NULL", ""}, {{}, "PRE_AGG_META_INFO", "__INTERNAL_DB", "memory", {}, {}, {}, "1", "0", "1", "NULL", "NULL", "NULL", ""}, {{}, "JOB_INFO", "__INTERNAL_DB", "memory", "0", {}, {}, "1", "0", "1", "NULL", "NULL", "NULL", ""}, {{}, @@ -3306,6 +3475,7 @@ void ExpectShowTableStatusResult(const std::vector> SystemStandaloneTableStatus = { + {{}, "USER", "__INTERNAL_DB", "memory", {}, {}, {}, "1", "0", "1", "NULL", "NULL", "NULL", ""}, {{}, "PRE_AGG_META_INFO", "__INTERNAL_DB", "memory", {}, {}, {}, "1", "0", "1", "NULL", "NULL", "NULL", ""}, {{}, "GLOBAL_VARIABLES", @@ -3972,10 +4142,10 @@ int main(int argc, char** argv) { int ok = ::openmldb::cmd::mc_->SetUp(2); sleep(5); srand(time(NULL)); - ::openmldb::sdk::ClusterOptions copt; - copt.zk_cluster = mc.GetZkCluster(); - copt.zk_path = mc.GetZkPath(); - copt.zk_session_timeout = FLAGS_zk_session_timeout; + auto copt = std::make_shared<::openmldb::sdk::SQLRouterOptions>(); + copt->zk_cluster = mc.GetZkCluster(); + copt->zk_path = mc.GetZkPath(); + copt->zk_session_timeout = FLAGS_zk_session_timeout; ::openmldb::cmd::cluster_cli.cs = new ::openmldb::sdk::ClusterSDK(copt); ::openmldb::cmd::cluster_cli.cs->Init(); ::openmldb::cmd::cluster_cli.sr = new ::openmldb::sdk::SQLClusterRouter(::openmldb::cmd::cluster_cli.cs); @@ -3984,7 +4154,8 @@ int main(int argc, char** argv) { env.SetUp(); FLAGS_host = "127.0.0.1"; FLAGS_port = env.GetNsPort(); - ::openmldb::cmd::standalone_cli.cs = new ::openmldb::sdk::StandAloneSDK(FLAGS_host, FLAGS_port); + auto sopt = std::make_shared<::openmldb::sdk::StandaloneOptions>(FLAGS_host, FLAGS_port); + ::openmldb::cmd::standalone_cli.cs = new ::openmldb::sdk::StandAloneSDK(sopt); ::openmldb::cmd::standalone_cli.cs->Init(); ::openmldb::cmd::standalone_cli.sr = new ::openmldb::sdk::SQLClusterRouter(::openmldb::cmd::standalone_cli.cs); ::openmldb::cmd::standalone_cli.sr->Init(); diff --git a/src/codec/codec_test.cc b/src/codec/codec_test.cc index 6c6ae99f804..054f431dfca 100644 --- a/src/codec/codec_test.cc +++ b/src/codec/codec_test.cc @@ -19,6 +19,7 @@ #include #include "boost/container/deque.hpp" +#include "codec/encrypt.h" #include "codec/row_codec.h" #include "gtest/gtest.h" #include "proto/common.pb.h" @@ -541,6 +542,13 @@ TEST_F(CodecTest, RowBuilderSet) { ASSERT_EQ(ts, 1668149927000); } +TEST_F(CodecTest, Encrypt) { + ASSERT_EQ(SHA256("root"), "4813494d137e1631bba301d5acab6e7bb7aa74ce1185d456565ef51d737677b2"); + ASSERT_EQ(SHA256(""), "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"); + ASSERT_EQ(Encrypt("root"), "14813494d137e1631bba301d5acab6e7bb7aa74ce1185d456565ef51d737677b2"); + ASSERT_EQ(Encrypt(""), "1e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"); +} + } // namespace codec } // namespace openmldb diff --git a/src/codec/encrypt.h b/src/codec/encrypt.h new file mode 100644 index 00000000000..9fcbd82aa59 --- /dev/null +++ b/src/codec/encrypt.h @@ -0,0 +1,52 @@ +/* + * Copyright 2021 4Paradigm + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SRC_CODEC_ENCRYPT_H_ +#define SRC_CODEC_ENCRYPT_H_ + +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "openssl/sha.h" + +namespace openmldb { +namespace codec { + +inline constexpr uint8_t VERSION = 1; + +static inline std::string SHA256(const std::string& str) { + unsigned char hash[SHA256_DIGEST_LENGTH]; + SHA256_CTX sha256; + SHA256_Init(&sha256); + SHA256_Update(&sha256, str.c_str(), str.size()); + SHA256_Final(hash, &sha256); + std::stringstream ss; + for (int i = 0; i < SHA256_DIGEST_LENGTH; i++) { + ss << std::hex << std::setw(2) << std::setfill('0') << static_cast(hash[i]); + } + return ss.str(); +} + +static inline std::string Encrypt(const std::string& passwd) { + return absl::StrCat(VERSION, SHA256(passwd)); +} + +} // namespace codec +} // namespace openmldb + +#endif // SRC_CODEC_ENCRYPT_H_ diff --git a/src/examples/test_udf.cc b/src/examples/test_udf.cc index 58438c61699..f4c3bced6ca 100644 --- a/src/examples/test_udf.cc +++ b/src/examples/test_udf.cc @@ -91,7 +91,7 @@ int64_t count_null_output(::openmldb::base::UDFContext* ctx) { return *(reinterpret_cast(ctx->ptr)); } -// Get the second non-null value of all values +// Get the third non-null value of all values extern "C" ::openmldb::base::UDFContext* third_init(::openmldb::base::UDFContext* ctx) { ctx->ptr = reinterpret_cast(new std::vector()); @@ -114,8 +114,45 @@ void third_output(::openmldb::base::UDFContext* ctx, int64_t* output, bool* is_n *is_null = true; } else { *is_null = false; - *output = vec->at(3); + *output = vec->at(2); } // free the memory allocated in init function with new/malloc delete vec; } + +// Get the first non-null value >= threshold +extern "C" +::openmldb::base::UDFContext* first_ge_init(::openmldb::base::UDFContext* ctx) { + // threshold init in update + // threshold, thresh_flag, first_ge, first_ge_flag + ctx->ptr = reinterpret_cast(new std::vector(4, 0)); + return ctx; +} + +extern "C" +::openmldb::base::UDFContext* first_ge_update(::openmldb::base::UDFContext* ctx, int64_t input, bool is_null, int64_t threshold, bool threshold_is_null) { + auto pair = reinterpret_cast*>(ctx->ptr); + if (!threshold_is_null && pair->at(1) == 0) { + pair->at(0) = threshold; + pair->at(1) = 1; + } + if (!is_null && pair->at(3) == 0 && input >= pair->at(0)) { + pair->at(2) = input; + pair->at(3) = 1; + } + return ctx; +} + +extern "C" +void first_ge_output(::openmldb::base::UDFContext* ctx, int64_t* output, bool* is_null) { + auto pair = reinterpret_cast*>(ctx->ptr); + // threshold is null or no value >= threshold + if (pair->at(1) == 0 || pair->at(3) == 0) { + *is_null = true; + } else { + *is_null = false; + *output = pair->at(2); + } + // free the memory allocated in init function with new/malloc + delete pair; +} diff --git a/src/flags.cc b/src/flags.cc index b16af056095..be92d0162ef 100644 --- a/src/flags.cc +++ b/src/flags.cc @@ -24,6 +24,8 @@ DEFINE_string(openmldb_log_dir, "./logs", "config the log dir of glog, for all l DEFINE_string(role, "", "Set the openmldb role for start: tablet | nameserver | client | ns_client | sql_client | apiserver"); 
DEFINE_string(cmd, "", "the command str, DO NOT add multi sqls"); +DEFINE_string(user, "root", "specify the user"); +DEFINE_string(password, "", "config the password"); DEFINE_int32(zk_session_timeout, 2000, "config the zk session timeout of cli in milliseconds, apiserver, tablet or nameserver"); DEFINE_uint32(tablet_heartbeat_timeout, 5 * 60 * 1000, "config the heartbeat of tablet offline. unit is milliseconds"); diff --git a/src/nameserver/name_server_impl.cc b/src/nameserver/name_server_impl.cc index d5cda3ae537..41065d61795 100644 --- a/src/nameserver/name_server_impl.cc +++ b/src/nameserver/name_server_impl.cc @@ -5470,6 +5470,10 @@ void NameServerImpl::OnLocked() { } } + if (FLAGS_system_table_replica_num > 0 && db_table_info_[INTERNAL_DB].count(USER_INFO_NAME) == 0) { + CreateSystemTableOrExit(SystemTableType::kUser); + } + if (FLAGS_system_table_replica_num > 0 && db_table_info_[INTERNAL_DB].count(PRE_AGG_META_NAME) == 0) { CreateSystemTableOrExit(SystemTableType::kPreAggMetaInfo); } @@ -9916,62 +9920,6 @@ base::Status NameServerImpl::InitGlobalVarTable() { return {}; } -/// \beirf create a SQLClusterRouter instance for use like monitoring statistics collecting -/// the actual instance is stored in `sr_` member -/// -/// \return true if action success, false if any error happens -bool NameServerImpl::GetSdkConnection() { - if (std::atomic_load_explicit(&sr_, std::memory_order_acquire) == nullptr) { - sdk::DBSDK* cs = nullptr; - PDLOG(INFO, "Init ClusterSDK in name server"); - if (IsClusterMode()) { - ::openmldb::sdk::ClusterOptions copt; - copt.zk_cluster = zk_path_.zk_cluster_; - copt.zk_path = zk_path_.root_path_; - cs = new ::openmldb::sdk::ClusterSDK(copt); - } else { - std::vector list = absl::StrSplit(endpoint_, ":"); - if (list.size() != 2) { - PDLOG(ERROR, "fail to split endpoint_"); - return false; - } - - int port = 0; - if (!absl::SimpleAtoi(list.at(1), &port)) { - PDLOG(ERROR, "fail to port string: %s", list.at(1)); - return false; - } - cs = new ::openmldb::sdk::StandAloneSDK(list.at(0), port); - } - bool ok = cs->Init(); - if (!ok) { - PDLOG(ERROR, "ERROR: Failed to init DBSDK"); - if (cs != nullptr) { - delete cs; - } - return false; - } - auto sr = std::make_shared<::openmldb::sdk::SQLClusterRouter>(cs); - if (!sr->Init()) { - PDLOG(ERROR, "fail to init SQLClusterRouter"); - if (cs != nullptr) { - delete cs; - } - return false; - } - - std::atomic_store_explicit(&sr_, sr, std::memory_order_release); - } - - return true; -} - -void NameServerImpl::FreeSdkConnection() { - if (std::atomic_load_explicit(&sr_, std::memory_order_acquire) != nullptr) { - std::atomic_store_explicit(&sr_, {}, std::memory_order_release); - } -} - std::shared_ptr NameServerImpl::CreateTaskInternal(const TaskMeta* task_meta) { auto task_type = task_meta->task_info->task_type(); std::shared_ptr client; diff --git a/src/nameserver/name_server_impl.h b/src/nameserver/name_server_impl.h index c8f5c56b04d..00ea95d7de6 100644 --- a/src/nameserver/name_server_impl.h +++ b/src/nameserver/name_server_impl.h @@ -673,10 +673,6 @@ class NameServerImpl : public NameServer { uint64_t GetTerm() const; - bool GetSdkConnection(); - - void FreeSdkConnection(); - bool RecoverExternalFunction(); ::openmldb::base::Status CheckZoneInfo(const ::openmldb::nameserver::ZoneInfo& zone_info); @@ -733,9 +729,6 @@ class NameServerImpl : public NameServer { std::unordered_map>> db_sp_info_map_; ::openmldb::type::StartupMode startup_mode_; - - // sr_ could be a real instance or nothing, remember always use atomic_* function 
to access it - std::shared_ptr<::openmldb::sdk::SQLClusterRouter> sr_ = nullptr; }; } // namespace nameserver diff --git a/src/nameserver/system_table.cc b/src/nameserver/system_table.cc index 830725a8d4a..38f9a9e8c18 100644 --- a/src/nameserver/system_table.cc +++ b/src/nameserver/system_table.cc @@ -27,6 +27,7 @@ static absl::flat_hash_map CreateSystemT {SystemTableType::kPreAggMetaInfo, {INTERNAL_DB, PRE_AGG_META_NAME}}, {SystemTableType::kGlobalVariable, {INFORMATION_SCHEMA_DB, GLOBAL_VARIABLES}}, {SystemTableType::kDeployResponseTime, {INFORMATION_SCHEMA_DB, DEPLOY_RESPONSE_TIME}}, + {SystemTableType::kUser, {INTERNAL_DB, USER_INFO_NAME}}, }; return map; } diff --git a/src/nameserver/system_table.h b/src/nameserver/system_table.h index bec114f8725..63bb507b68a 100644 --- a/src/nameserver/system_table.h +++ b/src/nameserver/system_table.h @@ -34,6 +34,7 @@ constexpr const char* INTERNAL_DB = "__INTERNAL_DB"; constexpr const char* PRE_AGG_DB = "__PRE_AGG_DB"; constexpr const char* JOB_INFO_NAME = "JOB_INFO"; constexpr const char* PRE_AGG_META_NAME = "PRE_AGG_META_INFO"; +constexpr const char* USER_INFO_NAME = "USER"; constexpr const char* INFORMATION_SCHEMA_DB = "INFORMATION_SCHEMA"; @@ -47,6 +48,7 @@ enum class SystemTableType { kPreAggMetaInfo = 2, kGlobalVariable = 3, kDeployResponseTime, + kUser, }; struct SystemTableInfo { @@ -159,6 +161,26 @@ class SystemTable { ttl->set_lat_ttl(1); break; } + case SystemTableType::kUser: { + SetColumnDesc("host", type::DataType::kString, table_info->add_column_desc()); + SetColumnDesc("user", type::DataType::kString, table_info->add_column_desc()); + SetColumnDesc("password", type::DataType::kString, table_info->add_column_desc()); + SetColumnDesc("password_last_changed", type::DataType::kTimestamp, table_info->add_column_desc()); + SetColumnDesc("password_expired_time", type::DataType::kBigInt, table_info->add_column_desc()); + SetColumnDesc("create_time", type::DataType::kTimestamp, table_info->add_column_desc()); + SetColumnDesc("update_time", type::DataType::kTimestamp, table_info->add_column_desc()); + SetColumnDesc("account_type", type::DataType::kInt, table_info->add_column_desc()); + SetColumnDesc("privileges", type::DataType::kString, table_info->add_column_desc()); + SetColumnDesc("extra_info", type::DataType::kString, table_info->add_column_desc()); + auto index = table_info->add_column_key(); + index->set_index_name("index"); + index->add_col_name("host"); + index->add_col_name("user"); + auto ttl = index->mutable_ttl(); + ttl->set_ttl_type(::openmldb::type::kLatestTime); + ttl->set_lat_ttl(1); + break; + } default: return nullptr; } diff --git a/src/nameserver/system_table_test.cc b/src/nameserver/system_table_test.cc index 9af9fdf5a0b..eadca079c93 100644 --- a/src/nameserver/system_table_test.cc +++ b/src/nameserver/system_table_test.cc @@ -69,7 +69,10 @@ TEST_F(SystemTableTest, SystemTable) { std::vector<::openmldb::nameserver::TableInfo> tables; std::string msg; ASSERT_TRUE(ns_client.ShowTable("", INTERNAL_DB, false, tables, msg)); - ASSERT_EQ(2, tables.size()); + ASSERT_EQ(3, tables.size()); + ASSERT_EQ("JOB_INFO", tables[0].name()); + ASSERT_EQ("PRE_AGG_META_INFO", tables[1].name()); + ASSERT_EQ("USER", tables[2].name()); tables.clear(); // deny drop system table ASSERT_FALSE(ns_client.DropDatabase(INTERNAL_DB, msg)); diff --git a/src/proto/tablet.proto b/src/proto/tablet.proto index 2c7a038960b..a1ae6e72d5a 100755 --- a/src/proto/tablet.proto +++ b/src/proto/tablet.proto @@ -195,6 +195,7 @@ message PutRequest { repeated 
TSDimension ts_dimensions = 7 [deprecated = true]; optional uint32 format_version = 8 [default = 0, deprecated = true]; optional uint32 memory_limit = 9; + optional bool put_if_absent = 10 [default = false]; } message PutResponse { @@ -211,6 +212,7 @@ message DeleteRequest { optional uint64 ts = 6; optional uint64 end_ts = 7; optional string ts_name = 8; + optional bool enable_decode_value = 9 [default = true]; } message ExecuteGcRequest { diff --git a/src/sdk/db_sdk.cc b/src/sdk/db_sdk.cc index a8b08e10259..8a4f951cee1 100644 --- a/src/sdk/db_sdk.cc +++ b/src/sdk/db_sdk.cc @@ -173,15 +173,15 @@ bool DBSDK::RemoveExternalFun(const std::string& name) { return true; } -ClusterSDK::ClusterSDK(const ClusterOptions& options) +ClusterSDK::ClusterSDK(const std::shared_ptr& options) : options_(options), session_id_(0), - table_root_path_(options.zk_path + "/table/db_table_data"), - sp_root_path_(options.zk_path + "/store_procedure/db_sp_data"), - notify_path_(options.zk_path + "/table/notify"), - globalvar_changed_notify_path_(options.zk_path + "/notify/global_variable"), - leader_path_(options.zk_path + "/leader"), - taskmanager_leader_path_(options.zk_path + "/taskmanager/leader"), + table_root_path_(options->zk_path + "/table/db_table_data"), + sp_root_path_(options->zk_path + "/store_procedure/db_sp_data"), + notify_path_(options->zk_path + "/table/notify"), + globalvar_changed_notify_path_(options->zk_path + "/notify/global_variable"), + leader_path_(options->zk_path + "/leader"), + taskmanager_leader_path_(options->zk_path + "/taskmanager/leader"), zk_client_(nullptr), pool_(1) {} @@ -212,18 +212,18 @@ void ClusterSDK::CheckZk() { } bool ClusterSDK::Init() { - zk_client_ = new ::openmldb::zk::ZkClient(options_.zk_cluster, "", - options_.zk_session_timeout, "", - options_.zk_path, - options_.zk_auth_schema, - options_.zk_cert); + zk_client_ = new ::openmldb::zk::ZkClient(options_->zk_cluster, "", + options_->zk_session_timeout, "", + options_->zk_path, + options_->zk_auth_schema, + options_->zk_cert); - bool ok = zk_client_->Init(options_.zk_log_level, options_.zk_log_file); + bool ok = zk_client_->Init(options_->zk_log_level, options_->zk_log_file); if (!ok) { - LOG(WARNING) << "fail to init zk client with " << options_.to_string(); + LOG(WARNING) << "fail to init zk client with " << options_->to_string(); return false; } - LOG(INFO) << "init zk client with " << options_.to_string() << " and session id " << zk_client_->GetSessionTerm(); + LOG(INFO) << "init zk client with " << options_->to_string() << " and session id " << zk_client_->GetSessionTerm(); ::hybridse::vm::EngineOptions eopt; eopt.SetCompileOnly(true); @@ -244,7 +244,7 @@ void ClusterSDK::WatchNotify() { session_id_ = zk_client_->GetSessionTerm(); zk_client_->CancelWatchItem(notify_path_); zk_client_->WatchItem(notify_path_, [this] { Refresh(); }); - zk_client_->WatchChildren(options_.zk_path + "/data/function", + zk_client_->WatchChildren(options_->zk_path + "/data/function", [this](auto&& PH1) { RefreshExternalFun(std::forward(PH1)); }); zk_client_->WatchChildren(leader_path_, [this](auto&& PH1) { RefreshNsClient(std::forward(PH1)); }); @@ -430,16 +430,19 @@ bool ClusterSDK::BuildCatalog() { return false; } } else { - DLOG(INFO) << "no procedures in db"; + LOG(INFO) << "no procedures in db"; } + // The empty database can't be find if we only get table datas, but database no notify, so we get alldbs from + // nameserver in GetAllDbs() return UpdateCatalog(table_datas, sp_datas); } std::vector DBSDK::GetAllDbs() { - 
std::lock_guard<::openmldb::base::SpinMutex> lock(mu_); std::vector all_dbs; - for (auto db_name_iter = table_to_tablets_.begin(); db_name_iter != table_to_tablets_.end(); db_name_iter++) { - all_dbs.push_back(db_name_iter->first); + std::string st; + if (!GetNsClient()->ShowDatabase(&all_dbs, st)) { + LOG(WARNING) << "show db from ns failed, msg: " << st; + return {}; } return all_dbs; } @@ -509,7 +512,7 @@ bool ClusterSDK::GetRealEndpointFromZk(const std::string& endpoint, std::string* if (real_endpoint == nullptr) { return false; } - std::string sdk_path = options_.zk_path + "/map/sdkendpoints/" + endpoint; + std::string sdk_path = options_->zk_path + "/map/sdkendpoints/" + endpoint; if (zk_client_->IsExistNode(sdk_path) == 0) { if (!zk_client_->GetNodeValue(sdk_path, *real_endpoint)) { DLOG(WARNING) << "get zk failed! : sdk_path: " << sdk_path; @@ -517,7 +520,7 @@ bool ClusterSDK::GetRealEndpointFromZk(const std::string& endpoint, std::string* } } if (real_endpoint->empty()) { - std::string sname_path = options_.zk_path + "/map/names/" + endpoint; + std::string sname_path = options_->zk_path + "/map/names/" + endpoint; if (zk_client_->IsExistNode(sname_path) == 0) { if (!zk_client_->GetNodeValue(sname_path, *real_endpoint)) { DLOG(WARNING) << "get zk failed! : sname_path: " << sname_path; diff --git a/src/sdk/db_sdk.h b/src/sdk/db_sdk.h index 2d8a4ab2f38..982bdd5a40f 100644 --- a/src/sdk/db_sdk.h +++ b/src/sdk/db_sdk.h @@ -29,6 +29,7 @@ #include "client/tablet_client.h" #include "client/taskmanager_client.h" #include "common/thread_pool.h" +#include "sdk/options.h" #include "vm/catalog.h" #include "vm/engine.h" #include "zk/zk_client.h" @@ -106,6 +107,8 @@ class DBSDK { virtual bool GetNsAddress(std::string* endpoint, std::string* real_endpoint) = 0; + virtual std::shared_ptr GetOptions() const = 0; + bool RegisterExternalFun(const std::shared_ptr& fun); bool RemoveExternalFun(const std::string& name); @@ -138,7 +141,7 @@ class DBSDK { class ClusterSDK : public DBSDK { public: - explicit ClusterSDK(const ClusterOptions& options); + explicit ClusterSDK(const std::shared_ptr& options); ~ClusterSDK() override; bool Init() override; @@ -146,12 +149,13 @@ class ClusterSDK : public DBSDK { bool TriggerNotify(::openmldb::type::NotifyType type) const override; zk::ZkClient* GetZkClient() override { return zk_client_; } - const ClusterOptions& GetClusterOptions() const { return options_; } bool GetNsAddress(std::string* endpoint, std::string* real_endpoint) override; void RefreshExternalFun(const std::vector& funs); + std::shared_ptr GetOptions() const override { return options_; } + protected: bool BuildCatalog() override; bool GetTaskManagerAddress(std::string* endpoint, std::string* real_endpoint) override; @@ -166,7 +170,7 @@ class ClusterSDK : public DBSDK { void RefreshTaskManagerClient(); private: - ClusterOptions options_; + std::shared_ptr options_; uint64_t session_id_; std::string table_root_path_; std::string sp_root_path_; @@ -182,7 +186,7 @@ class ClusterSDK : public DBSDK { class StandAloneSDK : public DBSDK { public: - StandAloneSDK(std::string host, int port) : host_(std::move(host)), port_(port) {} + explicit StandAloneSDK(const std::shared_ptr options) : options_(options) {} ~StandAloneSDK() override { pool_.Stop(false); } bool Init() override; @@ -201,15 +205,17 @@ class StandAloneSDK : public DBSDK { return false; } - const std::string& GetHost() const { return host_; } + std::shared_ptr GetOptions() const override { return options_; } + + const std::string& GetHost() const 
{ return options_->host; } - int GetPort() const { return port_; } + int GetPort() const { return options_->port; } // Before connecting to ns, we only have the host&port // NOTICE: when we call this method, we do not have the correct ns client, do not GetNsClient. bool GetNsAddress(std::string* endpoint, std::string* real_endpoint) override { std::stringstream ss; - ss << host_ << ":" << port_; + ss << GetHost() << ":" << GetPort(); *endpoint = ss.str(); *real_endpoint = ss.str(); return true; @@ -232,8 +238,7 @@ class StandAloneSDK : public DBSDK { } private: - std::string host_; - int port_; + std::shared_ptr options_; ::baidu::common::ThreadPool pool_{1}; }; diff --git a/src/sdk/db_sdk_test.cc b/src/sdk/db_sdk_test.cc index 293faa60179..bdd76cec47f 100644 --- a/src/sdk/db_sdk_test.cc +++ b/src/sdk/db_sdk_test.cc @@ -79,17 +79,17 @@ class DBSDKTest : public ::testing::Test { }; TEST_F(DBSDKTest, smokeEmptyCluster) { - ClusterOptions option; - option.zk_cluster = mc_->GetZkCluster(); - option.zk_path = mc_->GetZkPath(); + auto option = std::make_shared(); + option->zk_cluster = mc_->GetZkCluster(); + option->zk_path = mc_->GetZkPath(); ClusterSDK sdk(option); ASSERT_TRUE(sdk.Init()); } TEST_F(DBSDKTest, smokeTest) { - ClusterOptions option; - option.zk_cluster = mc_->GetZkCluster(); - option.zk_path = mc_->GetZkPath(); + auto option = std::make_shared(); + option->zk_cluster = mc_->GetZkCluster(); + option->zk_path = mc_->GetZkPath(); ClusterSDK sdk(option); ASSERT_TRUE(sdk.Init()); @@ -121,7 +121,7 @@ TEST_F(DBSDKTest, standAloneMode) { ASSERT_TRUE(sep != std::string::npos); auto host = ns.substr(0, sep); auto port = ns.substr(sep + 1); - StandAloneSDK sdk(host, std::stoi(port)); + StandAloneSDK sdk(std::make_shared(host, std::stoi(port))); ASSERT_TRUE(sdk.Init()); CreateTable(); diff --git a/src/sdk/mini_cluster_batch_bm.cc b/src/sdk/mini_cluster_batch_bm.cc index 8dc4e9e665e..296fd739021 100644 --- a/src/sdk/mini_cluster_batch_bm.cc +++ b/src/sdk/mini_cluster_batch_bm.cc @@ -85,9 +85,9 @@ static void BM_SimpleQueryFunction(benchmark::State& state) { // NOLINT rb.AppendInt64(ts); rb.AppendInt64(ts); rb.AppendInt64(ts); - ::openmldb::sdk::ClusterOptions option; - option.zk_cluster = mc->GetZkCluster(); - option.zk_path = mc->GetZkPath(); + auto option = std::make_shared<::openmldb::sdk::SQLRouterOptions>(); + option->zk_cluster = mc->GetZkCluster(); + option->zk_path = mc->GetZkPath(); ::openmldb::sdk::ClusterSDK sdk(option); sdk.Init(); std::vector> tablet; diff --git a/src/sdk/node_adapter.cc b/src/sdk/node_adapter.cc index ef9de07a774..58a0b534b4e 100644 --- a/src/sdk/node_adapter.cc +++ b/src/sdk/node_adapter.cc @@ -133,9 +133,9 @@ hybridse::sdk::Status NodeAdapter::ExtractDeleteOption( } if (match_index_col > 0) { if (!option->ts_name.empty()) { - option->index_map.clear(); + option->idx.reset(); } - if (option->index_map.empty()) { + if (!option->idx.has_value()) { matched_column_key.CopyFrom(column_key); } else { if (column_key.col_name_size() != matched_column_key.col_name_size() || @@ -144,7 +144,8 @@ hybridse::sdk::Status NodeAdapter::ExtractDeleteOption( return {hybridse::common::StatusCode::kCmdError, "hit multiple indexs"}; } } - option->index_map.emplace(index_pos[column_key.index_name()], pk); + option->idx = index_pos[column_key.index_name()]; + option->key = pk; for (const auto& col : matched_column_key.col_name()) { hit_con_col.insert(col); } @@ -153,7 +154,7 @@ hybridse::sdk::Status NodeAdapter::ExtractDeleteOption( } } } - if (!option->ts_name.empty() && 
!option->index_map.empty() && option->ts_name != matched_column_key.ts_name()) { + if (!option->ts_name.empty() && option->idx.has_value() && option->ts_name != matched_column_key.ts_name()) { return {hybridse::common::StatusCode::kCmdError, "ts name mismatch"}; } for (const auto& con : condition_vec) { @@ -330,7 +331,7 @@ bool NodeAdapter::TransformToTableDef(::hybridse::node::CreatePlanNode* create_n status->code = hybridse::common::kTypeError; return false; } - auto val = TransformDataType(*dynamic_cast(default_val), + auto val = TransformDataType(*dynamic_cast(default_val), add_column_desc->data_type()); if (!val) { status->msg = "default value type mismatch"; @@ -782,4 +783,19 @@ hybridse::sdk::Status NodeAdapter::ExtractCondition(const hybridse::node::Binary return CheckCondition(indexs, conditions); } +absl::StatusOr NodeAdapter::ExtractUserOption(const hybridse::node::OptionsMap& map) { + if (map.empty()) { + return ""; + } else if (map.size() > 1) { + return absl::InvalidArgumentError("only password option allowed"); + } + if (!absl::EqualsIgnoreCase(map.begin()->first, "password")) { + return absl::InvalidArgumentError("invalid option " + map.begin()->first); + } + if (map.begin()->second->GetDataType() != hybridse::node::kVarchar) { + return absl::InvalidArgumentError("the value of password should be string"); + } + return map.begin()->second->GetAsString(); +} + } // namespace openmldb::sdk diff --git a/src/sdk/node_adapter.h b/src/sdk/node_adapter.h index 412ebc2a78c..319c7b1b1fe 100644 --- a/src/sdk/node_adapter.h +++ b/src/sdk/node_adapter.h @@ -23,25 +23,16 @@ #include #include +#include "absl/status/statusor.h" #include "node/node_manager.h" #include "proto/name_server.pb.h" #include "proto/type.pb.h" +#include "sdk/option.h" #include "sdk/sql_delete_row.h" namespace openmldb { namespace sdk { -struct DeleteOption { - DeleteOption(const std::map& index, const std::string& name, - const std::optional& ts1, const std::optional& ts2) : - index_map(index), ts_name(name), start_ts(ts1), end_ts(ts2) {} - DeleteOption() = default; - std::map index_map; - std::string ts_name; - std::optional start_ts = std::nullopt; - std::optional end_ts = std::nullopt; -}; - class NodeAdapter { public: static bool TransformToTableDef(::hybridse::node::CreatePlanNode* create_node, @@ -69,6 +60,8 @@ class NodeAdapter { const std::vector& condition_vec, DeleteOption* option); + static absl::StatusOr ExtractUserOption(const hybridse::node::OptionsMap& map); + private: static hybridse::sdk::Status CheckCondition( const ::google::protobuf::RepeatedPtrField<::openmldb::common::ColumnKey>& indexs, diff --git a/src/sdk/node_adapter_test.cc b/src/sdk/node_adapter_test.cc index e09758b07cd..be86d8a790f 100644 --- a/src/sdk/node_adapter_test.cc +++ b/src/sdk/node_adapter_test.cc @@ -64,7 +64,7 @@ void CheckTablePartition(const ::openmldb::nameserver::TableInfo& table_info, if (table_partition.partition_meta(pos).is_leader()) { ASSERT_EQ(table_partition.partition_meta(pos).endpoint(), leader); } else { - ASSERT_EQ(follower.count(table_partition.partition_meta(pos).endpoint()), 1); + ASSERT_EQ(follower.count(table_partition.partition_meta(pos).endpoint()), (std::size_t)1); } } } @@ -124,11 +124,12 @@ static std::vector cases = { INSTANTIATE_TEST_SUITE_P(NodeAdapter, NodeAdapterTest, testing::ValuesIn(cases)); void CheckDeleteOption(const DeleteOption& option, const DeleteOption& expect_option) { - ASSERT_EQ(option.index_map.size(), expect_option.index_map.size()); - for (const auto& kv : option.index_map) { - 
auto iter = expect_option.index_map.find(kv.first); - ASSERT_TRUE(iter != expect_option.index_map.end()); - ASSERT_EQ(kv.second, iter->second); + if (option.idx.has_value()) { + ASSERT_TRUE(expect_option.idx.has_value()); + ASSERT_EQ(option.idx.value(), expect_option.idx.value()); + ASSERT_EQ(option.key, expect_option.key); + } else { + ASSERT_FALSE(expect_option.idx.has_value()); } ASSERT_EQ(expect_option.ts_name, option.ts_name); if (option.start_ts.has_value()) { @@ -143,6 +144,7 @@ void CheckDeleteOption(const DeleteOption& option, const DeleteOption& expect_op } else { ASSERT_FALSE(expect_option.end_ts.has_value()); } + ASSERT_EQ(option.enable_decode_value, expect_option.enable_decode_value); } struct DeleteOptionParm { @@ -165,51 +167,51 @@ TEST_P(DeleteOptionTest, TransformToTableInfo) { std::vector option_cases = { DeleteOptionParm({Condition("card", hybridse::node::FnOperator::kFnOpEq, "key1", type::DataType::kString)}, - DeleteOption({{0, "key1"}}, "", std::nullopt, std::nullopt)), + DeleteOption(0, "key1", "", std::nullopt, std::nullopt)), DeleteOptionParm({Condition("card", hybridse::node::FnOperator::kFnOpEq, "key1", type::DataType::kString), Condition("ts1", hybridse::node::FnOperator::kFnOpEq, "10", type::DataType::kBigInt)}, - DeleteOption({{0, "key1"}}, "ts1", 10, 9)), + DeleteOption(0, "key1", "ts1", 10, 9)), DeleteOptionParm({Condition("card", hybridse::node::FnOperator::kFnOpEq, "key1", type::DataType::kString), Condition("ts1", hybridse::node::FnOperator::kFnOpGe, "10", type::DataType::kBigInt)}, - DeleteOption({{0, "key1"}}, "ts1", std::nullopt, 9)), + DeleteOption(0, "key1", "ts1", std::nullopt, 9)), DeleteOptionParm({Condition("card", hybridse::node::FnOperator::kFnOpEq, "key1", type::DataType::kString), Condition("ts1", hybridse::node::FnOperator::kFnOpGt, "10", type::DataType::kBigInt)}, - DeleteOption({{0, "key1"}}, "ts1", std::nullopt, 10)), + DeleteOption(0, "key1", "ts1", std::nullopt, 10)), DeleteOptionParm({Condition("card", hybridse::node::FnOperator::kFnOpEq, "key1", type::DataType::kString), Condition("ts1", hybridse::node::FnOperator::kFnOpLt, "10", type::DataType::kBigInt)}, - DeleteOption({{0, "key1"}}, "ts1", 9, std::nullopt)), + DeleteOption(0, "key1", "ts1", 9, std::nullopt)), DeleteOptionParm({Condition("card", hybridse::node::FnOperator::kFnOpEq, "key1", type::DataType::kString), Condition("ts1", hybridse::node::FnOperator::kFnOpLe, "10", type::DataType::kBigInt)}, - DeleteOption({{0, "key1"}}, "ts1", 10, std::nullopt)), + DeleteOption(0, "key1", "ts1", 10, std::nullopt)), DeleteOptionParm({Condition("card", hybridse::node::FnOperator::kFnOpEq, "key1", type::DataType::kString), Condition("ts1", hybridse::node::FnOperator::kFnOpGe, "0", type::DataType::kBigInt)}, - DeleteOption({{0, "key1"}}, "ts1", std::nullopt, std::nullopt)), + DeleteOption(0, "key1", "ts1", std::nullopt, std::nullopt)), DeleteOptionParm({Condition("card", hybridse::node::FnOperator::kFnOpEq, "key1", type::DataType::kString), Condition("ts1", hybridse::node::FnOperator::kFnOpEq, "0", type::DataType::kBigInt)}, - DeleteOption({{0, "key1"}}, "ts1", 0, std::nullopt)), + DeleteOption(0, "key1", "ts1", 0, std::nullopt)), DeleteOptionParm({Condition("ts1", hybridse::node::FnOperator::kFnOpEq, "10", type::DataType::kBigInt)}, - DeleteOption({}, "ts1", 10, 9)), + DeleteOption(std::nullopt, "", "ts1", 10, 9)), DeleteOptionParm({Condition("ts1", hybridse::node::FnOperator::kFnOpGe, "10", type::DataType::kBigInt)}, - DeleteOption({}, "ts1", std::nullopt, 9)), + DeleteOption(std::nullopt, 
"", "ts1", std::nullopt, 9)), DeleteOptionParm({Condition("ts1", hybridse::node::FnOperator::kFnOpGe, "10", type::DataType::kBigInt), Condition("ts1", hybridse::node::FnOperator::kFnOpGe, "11", type::DataType::kBigInt)}, - DeleteOption({}, "ts1", std::nullopt, 10)), + DeleteOption(std::nullopt, "", "ts1", std::nullopt, 10)), DeleteOptionParm({Condition("ts1", hybridse::node::FnOperator::kFnOpGe, "10", type::DataType::kBigInt), Condition("ts1", hybridse::node::FnOperator::kFnOpGe, "11", type::DataType::kBigInt)}, - DeleteOption({}, "ts1", std::nullopt, 10)), + DeleteOption(std::nullopt, "", "ts1", std::nullopt, 10)), DeleteOptionParm({Condition("ts1", hybridse::node::FnOperator::kFnOpGe, "10", type::DataType::kBigInt), Condition("ts1", hybridse::node::FnOperator::kFnOpLt, "20", type::DataType::kBigInt)}, - DeleteOption({}, "ts1", 19, 9)), + DeleteOption(std::nullopt, "", "ts1", 19, 9)), DeleteOptionParm({Condition("ts1", hybridse::node::FnOperator::kFnOpGt, "10", type::DataType::kBigInt), Condition("ts1", hybridse::node::FnOperator::kFnOpLt, "20", type::DataType::kBigInt)}, - DeleteOption({}, "ts1", 19, 10)), + DeleteOption(std::nullopt, "", "ts1", 19, 10)), DeleteOptionParm({Condition("ts1", hybridse::node::FnOperator::kFnOpGt, "10", type::DataType::kBigInt), Condition("ts1", hybridse::node::FnOperator::kFnOpLe, "20", type::DataType::kBigInt)}, - DeleteOption({}, "ts1", 20, 10)), + DeleteOption(std::nullopt, "", "ts1", 20, 10)), DeleteOptionParm({Condition("card", hybridse::node::FnOperator::kFnOpEq, "key1", type::DataType::kString), Condition("ts1", hybridse::node::FnOperator::kFnOpGt, "10", type::DataType::kBigInt), Condition("ts1", hybridse::node::FnOperator::kFnOpLe, "20", type::DataType::kBigInt)}, - DeleteOption({{0, "key1"}}, "ts1", 20, 10)) + DeleteOption(0, "key1", "ts1", 20, 10)) }; INSTANTIATE_TEST_SUITE_P(NodeAdapter, DeleteOptionTest, testing::ValuesIn(option_cases)); diff --git a/src/sdk/option.h b/src/sdk/option.h new file mode 100644 index 00000000000..3acb4e30afa --- /dev/null +++ b/src/sdk/option.h @@ -0,0 +1,41 @@ +/* + * Copyright 2021 4Paradigm + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef SRC_SDK_OPTION_H_
+#define SRC_SDK_OPTION_H_
+
+#include <optional>
+
+namespace openmldb {
+namespace sdk {
+
+struct DeleteOption {
+    DeleteOption(std::optional<uint32_t> idx_i, const std::string& key_i, const std::string& ts_name_i,
+            std::optional<uint64_t> start_ts_i, std::optional<uint64_t> end_ts_i) :
+        idx(idx_i), key(key_i), ts_name(ts_name_i), start_ts(start_ts_i), end_ts(end_ts_i) {}
+    DeleteOption() = default;
+    std::optional<uint32_t> idx = std::nullopt;
+    std::string key;
+    std::string ts_name;
+    std::optional<uint64_t> start_ts = std::nullopt;
+    std::optional<uint64_t> end_ts = std::nullopt;
+    bool enable_decode_value = true;
+};
+
+}  // namespace sdk
+}  // namespace openmldb
+
+#endif  // SRC_SDK_OPTION_H_
diff --git a/src/sdk/options.h b/src/sdk/options.h
new file mode 100644
index 00000000000..80e7a5c5cfa
--- /dev/null
+++ b/src/sdk/options.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2021 4Paradigm
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_SDK_OPTIONS_H_
+#define SRC_SDK_OPTIONS_H_
+
+#include <sstream>
+#include <string>
+
+namespace openmldb {
+namespace sdk {
+
+struct BasicRouterOptions {
+    virtual ~BasicRouterOptions() = default;
+    bool enable_debug = false;
+    uint32_t max_sql_cache_size = 50;
+    // == gflag `request_timeout` default value(no gflags here cuz swig)
+    uint32_t request_timeout = 60000;
+    // default 0(INFO), INFO, WARNING, ERROR, and FATAL are 0, 1, 2, and 3
+    int glog_level = 0;
+    // empty means to stderr
+    std::string glog_dir = "";
+    std::string user = "root";
+    std::string password;
+};
+
+struct SQLRouterOptions : BasicRouterOptions {
+    std::string zk_cluster;
+    std::string zk_path;
+    uint32_t zk_session_timeout = 2000;
+    std::string spark_conf_path;
+    uint32_t zk_log_level = 3;  // PY/JAVA SDK default info log
+    std::string zk_log_file;
+    std::string zk_auth_schema = "digest";
+    std::string zk_cert;
+
+    std::string to_string() {
+        std::stringstream ss;
+        ss << "zk options [cluster:" << zk_cluster << ", path:" << zk_path
+           << ", zk_session_timeout:" << zk_session_timeout
+           << ", log_level:" << zk_log_level << ", log_file:" << zk_log_file
+           << ", zk_auth_schema:" << zk_auth_schema << ", zk_cert:" << zk_cert << "]";
+        return ss.str();
+    }
+};
+
+struct StandaloneOptions : BasicRouterOptions {
+    StandaloneOptions() = default;
+    StandaloneOptions(const std::string& h, uint32_t p) : host(h), port(p) {}
+    std::string host;
+    uint32_t port;
+};
+
+}  // namespace sdk
+}  // namespace openmldb
+#endif  // SRC_SDK_OPTIONS_H_
diff --git a/src/sdk/query_future_impl.h b/src/sdk/query_future_impl.h
new file mode 100644
index 00000000000..5f87a721171
--- /dev/null
+++ b/src/sdk/query_future_impl.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2021 4Paradigm
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SRC_SDK_QUERY_FUTURE_IMPL_H_ +#define SRC_SDK_QUERY_FUTURE_IMPL_H_ + +#include +#include "proto/tablet.pb.h" +#include "rpc/rpc_client.h" +#include "sdk/base.h" +#include "sdk/result_set_sql.h" +#include "sdk/sql_router.h" + +namespace openmldb { +namespace sdk { + +class QueryFutureImpl : public QueryFuture { + public: + explicit QueryFutureImpl(openmldb::RpcCallback* callback) : callback_(callback) { + if (callback_) { + callback_->Ref(); + } + } + ~QueryFutureImpl() { + if (callback_) { + callback_->UnRef(); + } + } + + std::shared_ptr GetResultSet(hybridse::sdk::Status* status) override { + if (!status) { + return nullptr; + } + if (!callback_ || !callback_->GetResponse() || !callback_->GetController()) { + status->code = hybridse::common::kRpcError; + status->msg = "request error, response or controller null"; + return nullptr; + } + brpc::Join(callback_->GetController()->call_id()); + if (callback_->GetController()->Failed()) { + status->code = hybridse::common::kRpcError; + status->msg = "request error, " + callback_->GetController()->ErrorText(); + return nullptr; + } + if (callback_->GetResponse()->code() != ::openmldb::base::kOk) { + status->code = callback_->GetResponse()->code(); + status->msg = "request error, " + callback_->GetResponse()->msg(); + return nullptr; + } + auto rs = ResultSetSQL::MakeResultSet(callback_->GetResponse(), callback_->GetController(), status); + return rs; + } + + bool IsDone() const override { + if (callback_) return callback_->IsDone(); + return false; + } + + private: + openmldb::RpcCallback* callback_; +}; + +class BatchQueryFutureImpl : public QueryFuture { + public: + explicit BatchQueryFutureImpl(openmldb::RpcCallback* callback) + : callback_(callback) { + if (callback_) { + callback_->Ref(); + } + } + + ~BatchQueryFutureImpl() { + if (callback_) { + callback_->UnRef(); + } + } + + std::shared_ptr GetResultSet(hybridse::sdk::Status* status) override { + if (!status) { + return nullptr; + } + if (!callback_ || !callback_->GetResponse() || !callback_->GetController()) { + status->code = hybridse::common::kRpcError; + status->msg = "request error, response or controller null"; + return nullptr; + } + brpc::Join(callback_->GetController()->call_id()); + if (callback_->GetController()->Failed()) { + status->code = hybridse::common::kRpcError; + status->msg = "request error. 
" + callback_->GetController()->ErrorText(); + return nullptr; + } + auto rs = std::make_shared(callback_->GetResponse(), + callback_->GetController()); + if (!rs->Init()) { + status->code = -1; + status->msg = "request error, resuletSetSQL init failed"; + return nullptr; + } + return rs; + } + + bool IsDone() const override { return callback_->IsDone(); } + + private: + openmldb::RpcCallback* callback_; +}; + + +} // namespace sdk +} // namespace openmldb +#endif // SRC_SDK_QUERY_FUTURE_IMPL_H_ diff --git a/src/sdk/split_test.cc b/src/sdk/split_test.cc index f4e877ab547..687ba7e7d62 100644 --- a/src/sdk/split_test.cc +++ b/src/sdk/split_test.cc @@ -67,6 +67,15 @@ TEST_P(SplitTest, SplitLineWithDelimiterForStrings) { } } +TEST_F(SplitTest, failedCases) { + // escape + std::vector splited; + // "abc\"", quote is ", should be abc\"(char 4), but this method will return abc\, missing the last char + SplitLineWithDelimiterForStrings("\"abc\\\"\"", ",", &splited, '"'); + ASSERT_EQ(1, splited.size()); + EXPECT_STREQ("abc\\", splited[0].c_str()); +} + } // namespace sdk } // namespace openmldb diff --git a/src/sdk/sql_cache.h b/src/sdk/sql_cache.h index a326437c10f..1fe0b346fa2 100644 --- a/src/sdk/sql_cache.h +++ b/src/sdk/sql_cache.h @@ -54,26 +54,28 @@ class InsertSQLCache : public SQLCache { InsertSQLCache(const std::shared_ptr<::openmldb::nameserver::TableInfo>& table_info, const std::shared_ptr<::hybridse::sdk::Schema>& column_schema, DefaultValueMap default_map, - uint32_t str_length, std::vector hole_idx_arr) + uint32_t str_length, std::vector hole_idx_arr, bool put_if_absent) : SQLCache(table_info->db(), table_info->tid(), table_info->name()), table_info_(table_info), column_schema_(column_schema), default_map_(std::move(default_map)), str_length_(str_length), - hole_idx_arr_(std::move(hole_idx_arr)) {} + hole_idx_arr_(std::move(hole_idx_arr)), + put_if_absent_(put_if_absent) {} std::shared_ptr<::openmldb::nameserver::TableInfo> GetTableInfo() { return table_info_; } std::shared_ptr<::hybridse::sdk::Schema> GetSchema() const { return column_schema_; } uint32_t GetStrLength() const { return str_length_; } const DefaultValueMap& GetDefaultValue() const { return default_map_; } const std::vector& GetHoleIdxArr() const { return hole_idx_arr_; } - + const bool IsPutIfAbsent() const { return put_if_absent_; } private: std::shared_ptr<::openmldb::nameserver::TableInfo> table_info_; std::shared_ptr<::hybridse::sdk::Schema> column_schema_; const DefaultValueMap default_map_; const uint32_t str_length_; const std::vector hole_idx_arr_; + const bool put_if_absent_; }; class RouterSQLCache : public SQLCache { diff --git a/src/sdk/sql_cluster_router.cc b/src/sdk/sql_cluster_router.cc index 7c5a98814b9..3dc11369fea 100644 --- a/src/sdk/sql_cluster_router.cc +++ b/src/sdk/sql_cluster_router.cc @@ -42,6 +42,7 @@ #include "boost/property_tree/ptree.hpp" #include "brpc/channel.h" #include "cmd/display.h" +#include "codec/encrypt.h" #include "common/timer.h" #include "glog/logging.h" #include "nameserver/system_table.h" @@ -55,6 +56,7 @@ #include "sdk/batch_request_result_set_sql.h" #include "sdk/job_table_helper.h" #include "sdk/node_adapter.h" +#include "sdk/query_future_impl.h" #include "sdk/result_set_sql.h" #include "sdk/sdk_util.h" #include "sdk/split.h" @@ -114,100 +116,6 @@ class ExplainInfoImpl : public ExplainInfo { std::string request_name_; }; -class QueryFutureImpl : public QueryFuture { - public: - explicit QueryFutureImpl(openmldb::RpcCallback* callback) : callback_(callback) { - if 
(callback_) { - callback_->Ref(); - } - } - ~QueryFutureImpl() { - if (callback_) { - callback_->UnRef(); - } - } - - std::shared_ptr GetResultSet(hybridse::sdk::Status* status) override { - if (!status) { - return nullptr; - } - if (!callback_ || !callback_->GetResponse() || !callback_->GetController()) { - status->code = hybridse::common::kRpcError; - status->msg = "request error, response or controller null"; - return nullptr; - } - brpc::Join(callback_->GetController()->call_id()); - if (callback_->GetController()->Failed()) { - status->code = hybridse::common::kRpcError; - status->msg = "request error, " + callback_->GetController()->ErrorText(); - return nullptr; - } - if (callback_->GetResponse()->code() != ::openmldb::base::kOk) { - status->code = callback_->GetResponse()->code(); - status->msg = "request error, " + callback_->GetResponse()->msg(); - return nullptr; - } - auto rs = ResultSetSQL::MakeResultSet(callback_->GetResponse(), callback_->GetController(), status); - return rs; - } - - bool IsDone() const override { - if (callback_) return callback_->IsDone(); - return false; - } - - private: - openmldb::RpcCallback* callback_; -}; - -class BatchQueryFutureImpl : public QueryFuture { - public: - explicit BatchQueryFutureImpl(openmldb::RpcCallback* callback) - : callback_(callback) { - if (callback_) { - callback_->Ref(); - } - } - - ~BatchQueryFutureImpl() { - if (callback_) { - callback_->UnRef(); - } - } - - std::shared_ptr GetResultSet(hybridse::sdk::Status* status) override { - if (!status) { - return nullptr; - } - if (!callback_ || !callback_->GetResponse() || !callback_->GetController()) { - status->code = hybridse::common::kRpcError; - status->msg = "request error, response or controller null"; - return nullptr; - } - brpc::Join(callback_->GetController()->call_id()); - if (callback_->GetController()->Failed()) { - status->code = hybridse::common::kRpcError; - status->msg = "request error. 
" + callback_->GetController()->ErrorText(); - return nullptr; - } - std::shared_ptr<::openmldb::sdk::SQLBatchRequestResultSet> rs = - std::make_shared(callback_->GetResponse(), - callback_->GetController()); - bool ok = rs->Init(); - if (!ok) { - status->code = -1; - status->msg = "request error, resuletSetSQL init failed"; - return nullptr; - } - return rs; - } - - bool IsDone() const override { return callback_->IsDone(); } - - private: - openmldb::RpcCallback* callback_; -}; - SQLClusterRouter::SQLClusterRouter(const SQLRouterOptions& options) : options_(std::make_shared(options)), is_cluster_mode_(true), @@ -231,11 +139,7 @@ SQLClusterRouter::SQLClusterRouter(DBSDK* sdk) cluster_sdk_(sdk), mu_(), rand_(::baidu::common::timer::now_time()) { - if (is_cluster_mode_) { - options_ = std::make_shared(); - } else { - options_ = std::make_shared(); - } + options_ = sdk->GetOptions(); } SQLClusterRouter::~SQLClusterRouter() { delete cluster_sdk_; } @@ -253,15 +157,7 @@ bool SQLClusterRouter::Init() { // init cluster_sdk_, require options_ or standalone_options_ is set if (is_cluster_mode_) { auto ops = std::dynamic_pointer_cast(options_); - ClusterOptions coptions; - coptions.zk_cluster = ops->zk_cluster; - coptions.zk_path = ops->zk_path; - coptions.zk_session_timeout = ops->zk_session_timeout; - coptions.zk_log_level = ops->zk_log_level; - coptions.zk_log_file = ops->zk_log_file; - coptions.zk_auth_schema = ops->zk_auth_schema; - coptions.zk_cert = ops->zk_cert; - cluster_sdk_ = new ClusterSDK(coptions); + cluster_sdk_ = new ClusterSDK(ops); // TODO(hw): no detail error info bool ok = cluster_sdk_->Init(); if (!ok) { @@ -270,35 +166,13 @@ bool SQLClusterRouter::Init() { } } else { auto ops = std::dynamic_pointer_cast(options_); - cluster_sdk_ = new ::openmldb::sdk::StandAloneSDK(ops->host, ops->port); + cluster_sdk_ = new ::openmldb::sdk::StandAloneSDK(ops); bool ok = cluster_sdk_->Init(); if (!ok) { LOG(WARNING) << "fail to init standalone sdk"; return false; } } - } else { - // init options_ or standalone_options_ if fileds not filled, they should be consistent with cluster_sdk_ - // - // might better to refactor constructors & fileds for SQLClusterRouter - // but will introduce breaking changes as well - if (is_cluster_mode_) { - auto ops = std::dynamic_pointer_cast(options_); - if (ops->zk_cluster.empty() || ops->zk_path.empty()) { - auto* cluster_sdk = dynamic_cast(cluster_sdk_); - DCHECK(cluster_sdk != nullptr); - ops->zk_cluster = cluster_sdk->GetClusterOptions().zk_cluster; - ops->zk_path = cluster_sdk->GetClusterOptions().zk_path; - } - } else { - auto ops = std::dynamic_pointer_cast(options_); - if (ops->host.empty() || ops->port == 0) { - auto* standalone_sdk = dynamic_cast(cluster_sdk_); - DCHECK(standalone_sdk != nullptr); - ops->host = standalone_sdk->GetHost(); - ops->port = standalone_sdk->GetPort(); - } - } } std::string db = openmldb::nameserver::INFORMATION_SCHEMA_DB; @@ -323,6 +197,40 @@ bool SQLClusterRouter::Init() { session_variables_.emplace("insert_memory_usage_limit", "0"); session_variables_.emplace("spark_config", ""); } + return Auth(); +} + +bool SQLClusterRouter::Auth() { + auto ns_client = cluster_sdk_->GetNsClient(); + std::vector<::openmldb::nameserver::TableInfo> tables; + std::string msg; + auto ok = ns_client->ShowTable(nameserver::USER_INFO_NAME, nameserver::INTERNAL_DB, false, tables, msg); + if (!ok) { + LOG(WARNING) << "fail to get table from nameserver. 
error msg: " << msg; + return false; + } + if (tables.empty()) { + return true; + } + UserInfo info; + auto result = GetUser(options_->user, &info); + if (result.ok()) { + if (!(*result)) { + if (options_->user == "root") { + return true; + } + LOG(WARNING) << "user " << options_->user << " does not exist"; + return false; + } + auto password = options_->password.empty() ? options_->password : codec::Encrypt(options_->password); + if (info.password != password) { + LOG(WARNING) << "wrong password!"; + return false; + } + } else { + LOG(WARNING) << result.status(); + return false; + } return true; } @@ -455,39 +363,40 @@ std::shared_ptr SQLClusterRouter::GetInsertRow(const std::string& *status = {}; return std::make_shared(insert_cache->GetTableInfo(), insert_cache->GetSchema(), insert_cache->GetDefaultValue(), insert_cache->GetStrLength(), - insert_cache->GetHoleIdxArr()); + insert_cache->GetHoleIdxArr(), insert_cache->IsPutIfAbsent()); } } std::shared_ptr<::openmldb::nameserver::TableInfo> table_info; DefaultValueMap default_map; uint32_t str_length = 0; std::vector stmt_column_idx_arr; - if (!GetInsertInfo(db, sql, status, &table_info, &default_map, &str_length, &stmt_column_idx_arr)) { - SET_STATUS_AND_WARN(status, StatusCode::kCmdError, "get insert information failed"); + bool put_if_absent = false; + if (!GetInsertInfo(db, sql, status, &table_info, &default_map, &str_length, &stmt_column_idx_arr, &put_if_absent)) { + PREPEND_AND_WARN(status, "fail to get insert info"); return {}; } auto schema = openmldb::schema::SchemaAdapter::ConvertSchema(table_info->column_desc()); - auto insert_cache = - std::make_shared(table_info, schema, default_map, str_length, - SQLInsertRow::GetHoleIdxArr(default_map, stmt_column_idx_arr, schema)); + auto insert_cache = std::make_shared( + table_info, schema, default_map, str_length, + SQLInsertRow::GetHoleIdxArr(default_map, stmt_column_idx_arr, schema), put_if_absent); SetCache(db, sql, hybridse::vm::kBatchMode, insert_cache); *status = {}; return std::make_shared(insert_cache->GetTableInfo(), insert_cache->GetSchema(), insert_cache->GetDefaultValue(), insert_cache->GetStrLength(), - insert_cache->GetHoleIdxArr()); + insert_cache->GetHoleIdxArr(), insert_cache->IsPutIfAbsent()); } bool SQLClusterRouter::GetMultiRowInsertInfo(const std::string& db, const std::string& sql, ::hybridse::sdk::Status* status, std::shared_ptr<::openmldb::nameserver::TableInfo>* table_info, std::vector* default_maps, - std::vector* str_lengths) { + std::vector* str_lengths, bool* put_if_absent) { RET_FALSE_IF_NULL_AND_WARN(status, "output status is nullptr"); // TODO(hw): return status? 
RET_FALSE_IF_NULL_AND_WARN(table_info, "output table_info is nullptr"); RET_FALSE_IF_NULL_AND_WARN(default_maps, "output default_maps is nullptr"); RET_FALSE_IF_NULL_AND_WARN(str_lengths, "output str_lengths is nullptr"); - + RET_FALSE_IF_NULL_AND_WARN(put_if_absent, "output put_if_absent is nullptr"); ::hybridse::node::NodeManager nm; ::hybridse::plan::PlanNodeList plans; bool ok = GetSQLPlan(sql, &nm, &plans); @@ -506,6 +415,7 @@ bool SQLClusterRouter::GetMultiRowInsertInfo(const std::string& db, const std::s SET_STATUS_AND_WARN(status, StatusCode::kPlanError, "insert stmt is null"); return false; } + *put_if_absent = insert_stmt->insert_mode_ == ::hybridse::node::InsertStmt::IGNORE; std::string db_name; if (!insert_stmt->db_name_.empty()) { db_name = insert_stmt->db_name_; @@ -576,7 +486,7 @@ bool SQLClusterRouter::GetMultiRowInsertInfo(const std::string& db, const std::s bool SQLClusterRouter::GetInsertInfo(const std::string& db, const std::string& sql, ::hybridse::sdk::Status* status, std::shared_ptr<::openmldb::nameserver::TableInfo>* table_info, DefaultValueMap* default_map, uint32_t* str_length, - std::vector* stmt_column_idx_in_table) { + std::vector* stmt_column_idx_in_table, bool* put_if_absent) { RET_FALSE_IF_NULL_AND_WARN(status, "output status is nullptr"); RET_FALSE_IF_NULL_AND_WARN(table_info, "output table_info is nullptr"); RET_FALSE_IF_NULL_AND_WARN(default_map, "output default_map is nullptr"); @@ -636,6 +546,7 @@ bool SQLClusterRouter::GetInsertInfo(const std::string& db, const std::string& s SET_STATUS_AND_WARN(status, StatusCode::kCmdError, "get default value map of " + sql + " failed"); return false; } + *put_if_absent = insert_stmt->insert_mode_ == ::hybridse::node::InsertStmt::IGNORE; return true; } @@ -771,23 +682,24 @@ std::shared_ptr SQLClusterRouter::GetInsertRows(const std::string status->SetOK(); return std::make_shared(insert_cache->GetTableInfo(), insert_cache->GetSchema(), insert_cache->GetDefaultValue(), insert_cache->GetStrLength(), - insert_cache->GetHoleIdxArr()); + insert_cache->GetHoleIdxArr(), insert_cache->IsPutIfAbsent()); } } std::shared_ptr<::openmldb::nameserver::TableInfo> table_info; DefaultValueMap default_map; uint32_t str_length = 0; std::vector stmt_column_idx_arr; - if (!GetInsertInfo(db, sql, status, &table_info, &default_map, &str_length, &stmt_column_idx_arr)) { + bool put_if_absent = false; + if (!GetInsertInfo(db, sql, status, &table_info, &default_map, &str_length, &stmt_column_idx_arr, &put_if_absent)) { return {}; } auto col_schema = openmldb::schema::SchemaAdapter::ConvertSchema(table_info->column_desc()); - insert_cache = - std::make_shared(table_info, col_schema, default_map, str_length, - SQLInsertRow::GetHoleIdxArr(default_map, stmt_column_idx_arr, col_schema)); + insert_cache = std::make_shared( + table_info, col_schema, default_map, str_length, + SQLInsertRow::GetHoleIdxArr(default_map, stmt_column_idx_arr, col_schema), put_if_absent); SetCache(db, sql, hybridse::vm::kBatchMode, insert_cache); return std::make_shared(table_info, insert_cache->GetSchema(), default_map, str_length, - insert_cache->GetHoleIdxArr()); + insert_cache->GetHoleIdxArr(), insert_cache->IsPutIfAbsent()); } bool SQLClusterRouter::ExecuteDDL(const std::string& db, const std::string& sql, hybridse::sdk::Status* status) { @@ -1303,7 +1215,8 @@ bool SQLClusterRouter::ExecuteInsert(const std::string& db, const std::string& s std::shared_ptr<::openmldb::nameserver::TableInfo> table_info; std::vector default_maps; std::vector str_lengths; - if 
(!GetMultiRowInsertInfo(db, sql, status, &table_info, &default_maps, &str_lengths)) { + bool put_if_absent = false; + if (!GetMultiRowInsertInfo(db, sql, status, &table_info, &default_maps, &str_lengths, &put_if_absent)) { CODE_PREPEND_AND_WARN(status, StatusCode::kCmdError, "Fail to get insert info"); return false; } @@ -1318,18 +1231,9 @@ bool SQLClusterRouter::ExecuteInsert(const std::string& db, const std::string& s } std::vector fails; for (size_t i = 0; i < default_maps.size(); i++) { - auto row = std::make_shared(table_info, schema, default_maps[i], str_lengths[i]); - if (!row) { - LOG(WARNING) << "fail to parse row[" << i << "]"; - fails.push_back(i); - continue; - } - if (!row->Init(0)) { - LOG(WARNING) << "fail to encode row[" << i << " for table " << table_info->name(); - fails.push_back(i); - continue; - } - if (!row->IsComplete()) { + auto row = std::make_shared(table_info, schema, default_maps[i], str_lengths[i], put_if_absent); + if (!row || !row->Init(0) || !row->IsComplete()) { + // TODO(hw): SQLInsertRow or DefaultValueMap needs print helper function LOG(WARNING) << "fail to build row[" << i << "]"; fails.push_back(i); continue; @@ -1367,19 +1271,20 @@ bool SQLClusterRouter::PutRow(uint32_t tid, const std::shared_ptr& if (client) { DLOG(INFO) << "put data to endpoint " << client->GetEndpoint() << " with dimensions size " << kv.second.size(); - auto ret = client->Put(tid, pid, cur_ts, row->GetRow(), kv.second, - insert_memory_usage_limit_.load(std::memory_order_relaxed)); + auto ret = + client->Put(tid, pid, cur_ts, row->GetRow(), kv.second, + insert_memory_usage_limit_.load(std::memory_order_relaxed), row->IsPutIfAbsent()); if (!ret.OK()) { - if (RevertPut(row->GetTableInfo(), pid, dimensions, cur_ts, - base::Slice(row->GetRow()), tablets).IsOK()) { + if (RevertPut(row->GetTableInfo(), pid, dimensions, cur_ts, base::Slice(row->GetRow()), tablets) + .IsOK()) { SET_STATUS_AND_WARN(status, StatusCode::kCmdError, - absl::StrCat("INSERT failed, tid ", tid)); + absl::StrCat("INSERT failed, tid ", tid)); } else { SET_STATUS_AND_WARN(status, StatusCode::kCmdError, - "INSERT failed, tid " + std::to_string(tid) + - ". Note that data might have been partially inserted. " - "You are encouraged to perform DELETE to remove any partially " - "inserted data before trying INSERT again."); + "INSERT failed, tid " + std::to_string(tid) + + ". Note that data might have been partially inserted. 
" + "You are encouraged to perform DELETE to remove any partially " + "inserted data before trying INSERT again."); } return false; } @@ -1449,7 +1354,8 @@ bool SQLClusterRouter::ExecuteInsert(const std::string& db, const std::string& s bool SQLClusterRouter::ExecuteInsert(const std::string& db, const std::string& name, int tid, int partition_num, hybridse::sdk::ByteArrayPtr dimension, int dimension_len, - hybridse::sdk::ByteArrayPtr value, int len, hybridse::sdk::Status* status) { + hybridse::sdk::ByteArrayPtr value, int len, bool put_if_absent, + hybridse::sdk::Status* status) { RET_FALSE_IF_NULL_AND_WARN(status, "output status is nullptr"); if (dimension == nullptr || dimension_len <= 0 || value == nullptr || len <= 0 || partition_num <= 0) { *status = {StatusCode::kCmdError, "invalid parameter"}; @@ -1491,13 +1397,14 @@ bool SQLClusterRouter::ExecuteInsert(const std::string& db, const std::string& n DLOG(INFO) << "put data to endpoint " << client->GetEndpoint() << " with dimensions size " << kv.second.size(); auto ret = client->Put(tid, pid, cur_ts, row_value, &kv.second, - insert_memory_usage_limit_.load(std::memory_order_relaxed)); + insert_memory_usage_limit_.load(std::memory_order_relaxed), put_if_absent); if (!ret.OK()) { + // TODO(hw): show put failed row(readable)? ::hybridse::codec::RowView::GetRowString? SET_STATUS_AND_WARN(status, StatusCode::kCmdError, - "INSERT failed, tid " + std::to_string(tid) + - ". Note that data might have been partially inserted. " - "You are encouraged to perform DELETE to remove any partially " - "inserted data before trying INSERT again."); + "INSERT failed, tid " + std::to_string(tid) + + ". Note that data might have been partially inserted. " + "You are encouraged to perform DELETE to remove any partially " + "inserted data before trying INSERT again."); std::map>> dimensions; for (const auto& val : dimensions_map) { std::vector> vec; @@ -1510,9 +1417,10 @@ bool SQLClusterRouter::ExecuteInsert(const std::string& db, const std::string& n if (!table_info) { return false; } + // TODO(hw): better to return absl::Status if (RevertPut(*table_info, pid, dimensions, cur_ts, row_value, tablets).IsOK()) { SET_STATUS_AND_WARN(status, StatusCode::kCmdError, - absl::StrCat("INSERT failed, tid ", tid)); + absl::StrCat("INSERT failed, tid ", tid)); } return false; } @@ -1723,6 +1631,11 @@ std::shared_ptr SQLClusterRouter::HandleSQLCmd(const h return ResultSetSQL::MakeResultSet({"Tables"}, values, status); } + case hybridse::node::kCmdShowUser: { + std::vector value = { options_->user }; + return ResultSetSQL::MakeResultSet({"User"}, {value}, status); + } + case hybridse::node::kCmdShowCreateTable: { auto& args = cmd_node->GetArgs(); std::string cur_db = db; @@ -1819,6 +1732,23 @@ std::shared_ptr SQLClusterRouter::HandleSQLCmd(const h } return {}; } + case hybridse::node::kCmdDropUser: { + std::string name = cmd_node->GetArgs()[0]; + if (cmd_node->IsIfExists()) { + *status = DeleteUser(name); + } else { + UserInfo user_info; + auto result = GetUser(name, &user_info); + if (!result.ok()) { + *status = {StatusCode::kCmdError, result.status().message()}; + } else if (!(*result)) { + *status = {StatusCode::kCmdError, absl::StrCat("user ", name, " does not exist")}; + } else { + *status = DeleteUser(name); + } + } + return {}; + } case hybridse::node::kCmdShowFunctions: { std::vector<::openmldb::common::ExternalFun> funs; base::Status st = ns_ptr->ShowFunction("", &funs); @@ -2116,7 +2046,9 @@ std::shared_ptr SQLClusterRouter::HandleSQLCmd(const h const auto& args = 
cmd_node->GetArgs(); return ExecuteShowTableStatus(db, args.size() > 0 ? args[0] : "", status); } - default: { *status = {StatusCode::kCmdError, "fail to execute script with unsupported type"}; } + default: { + *status = {StatusCode::kCmdError, "fail to execute script with unsupported type"}; + } } return {}; } @@ -2730,6 +2662,56 @@ std::shared_ptr SQLClusterRouter::ExecuteSQL( } return {}; } + case hybridse::node::kPlanTypeCreateUser: { + auto create_node = dynamic_cast(node); + UserInfo user_info;; + auto result = GetUser(create_node->Name(), &user_info); + if (!result.ok()) { + *status = {StatusCode::kCmdError, result.status().message()}; + } else if (*result) { + if (!create_node->IfNotExists()) { + *status = {StatusCode::kCmdError, absl::StrCat("user ", create_node->Name(), " already exists")}; + } + } else { + std::string password; + if (create_node->Options()) { + auto ret = NodeAdapter::ExtractUserOption(*create_node->Options()); + if (!ret.ok()) { + *status = {StatusCode::kCmdError, ret.status().message()}; + return {}; + } + password = *ret; + } + *status = AddUser(create_node->Name(), password); + } + return {}; + } + case hybridse::node::kPlanTypeAlterUser: { + auto alter_node = dynamic_cast(node); + UserInfo user_info; + auto result = GetUser(alter_node->Name(), &user_info); + if (!result.ok()) { + *status = {StatusCode::kCmdError, result.status().message()}; + return {}; + } else if (!(*result)) { + if (!alter_node->IfExists() && alter_node->Name() != "root") { + *status = {StatusCode::kCmdError, absl::StrCat("user ", alter_node->Name(), " does not exists")}; + return {}; + } + user_info.name = "root"; + user_info.create_time = ::baidu::common::timer::get_micros() / 1000; + user_info.privileges = "ALL"; + } + if (alter_node->Options() && !alter_node->Options()->empty()) { + auto ret = NodeAdapter::ExtractUserOption(*alter_node->Options()); + if (!ret.ok()) { + *status = {StatusCode::kCmdError, ret.status().message()}; + return {}; + } + *status = UpdateUser(user_info, *ret); + } + return {}; + } case hybridse::node::kPlanTypeCreateIndex: { auto create_index_plan_node = dynamic_cast(node); auto create_index_node = create_index_plan_node->create_index_node_; @@ -2824,6 +2806,7 @@ std::shared_ptr SQLClusterRouter::ExecuteSQL( ::openmldb::taskmanager::JobInfo job_info; std::map config = ParseSparkConfigString(GetSparkConfig()); ReadSparkConfFromFile(std::dynamic_pointer_cast(options_)->spark_conf_path, &config); + AddUserToConfig(&config); auto base_status = ExportOfflineData(sql, config, db, is_sync_job, offline_job_timeout, &job_info); if (base_status.OK()) { @@ -2855,7 +2838,7 @@ std::shared_ptr SQLClusterRouter::ExecuteSQL( if (!cluster_sdk_->IsClusterMode() || is_local.value()) { if (cluster_sdk_->IsClusterMode() && !IsOnlineMode()) { *status = {::hybridse::common::StatusCode::kCmdError, - "local load only supports loading data to online storage"}; + "local load only supports loading data to online storage"}; return {}; } @@ -2875,18 +2858,19 @@ std::shared_ptr SQLClusterRouter::ExecuteSQL( *status = {::hybridse::common::StatusCode::kCmdError, st.ToString()}; return {}; } - // Load data locally + // Load data locally, report error in status *status = HandleLoadDataInfile(database, plan->Table(), plan->File(), options_parser); } else { // Load data using Spark ::openmldb::taskmanager::JobInfo job_info; std::map config = ParseSparkConfigString(GetSparkConfig()); ReadSparkConfFromFile(std::dynamic_pointer_cast(options_)->spark_conf_path, &config); + AddUserToConfig(&config); 
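// [Editor's note — illustrative aside, not a patch line] The online LOAD DATA
// branch below now forwards the insert memory-usage limit to the Spark job
// under a "spark."-prefixed key. Minimal sketch of the resulting entry (the
// map name is hypothetical; "0" mirrors the session default set earlier in
// this file; assumes <map> and <string>):
std::map<std::string, std::string> example_spark_config = {
    {"spark.insert_memory_usage_limit", "0"},
};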
::openmldb::base::Status base_status; if (is_online_mode) { // Handle in online mode - config.emplace("insert_memory_usage_limit", + config.emplace("spark.insert_memory_usage_limit", std::to_string(insert_memory_usage_limit_.load(std::memory_order_relaxed))); base_status = ImportOnlineData(sql, config, database, is_sync_job, offline_job_timeout, &job_info); } else { @@ -3030,6 +3014,7 @@ std::shared_ptr SQLClusterRouter::ExecuteOfflineQuery( RET_IF_NULL_AND_WARN(status, "output status is nullptr"); std::map config = ParseSparkConfigString(GetSparkConfig()); ReadSparkConfFromFile(std::dynamic_pointer_cast(options_)->spark_conf_path, &config); + AddUserToConfig(&config); if (is_sync_job) { // Run offline sql and wait to get output @@ -3311,7 +3296,7 @@ hybridse::sdk::Status SQLClusterRouter::HandleLoadDataInfile( auto thread = options_parser.GetAs("thread"); if (!thread.ok()) { - return {StatusCode::kCmdError, "thread option get failed" + options_parser.ToString()}; + return {StatusCode::kCmdError, "thread option get failed " + options_parser.ToString()}; } auto thread_num = thread.value(); std::vector counts(thread_num); @@ -3383,27 +3368,30 @@ hybridse::sdk::Status SQLClusterRouter::LoadDataSingleFile(int id, int step, con std::vector cols; auto deli = options_parser.GetAs("delimiter"); auto quote = options_parser.GetAs("quote"); - if (!deli.ok() || !quote.ok()) { - return {StatusCode::kCmdError, "delimiter/quote option get failed" + options_parser.ToString()}; + auto null_value = options_parser.GetAs("null_value"); + if (!deli.ok() || !quote.ok() || !null_value.ok()) { + return {StatusCode::kCmdError, "delimiter/quote/null_value option get failed " + options_parser.ToString()}; } + // peek the first line, check the column size and if it's a header ::openmldb::sdk::SplitLineWithDelimiterForStrings(line, deli.value(), &cols, quote.value().empty() ? 
'\0' : quote.value()[0]); auto schema = GetTableSchema(database, table); if (!schema) { - return {StatusCode::kCmdError, "table does not exist"}; + return {StatusCode::kTableNotFound, "table does not exist"}; } if (static_cast(cols.size()) != schema->GetColumnCnt()) { return {StatusCode::kCmdError, "mismatch column size"}; } auto header = options_parser.GetAs("header"); if (!header.ok()) { - return {StatusCode::kCmdError, "header option get failed" + options_parser.ToString()}; + return {StatusCode::kCmdError, "header option get failed " + options_parser.ToString()}; } if (header.value()) { // the first line is the column names, check if equal with table schema for (int i = 0; i < schema->GetColumnCnt(); ++i) { if (cols[i] != schema->GetColumnName(i)) { - return {StatusCode::kCmdError, "mismatch column name"}; + return {StatusCode::kCmdError, absl::StrCat("mismatch column name ", cols[i], " ", + schema->GetColumnName(i), " in file ", file_path)}; } } // then read the first row of data @@ -3423,19 +3411,12 @@ hybridse::sdk::Status SQLClusterRouter::LoadDataSingleFile(int id, int step, con str_cols_idx.emplace_back(i); } } + int64_t i = 0; do { // only process the line assigned to its own id if (i % step == id) { cols.clear(); - std::string error; - auto deli = options_parser.GetAs("delimiter"); - auto quote = options_parser.GetAs("quote"); - auto null_value = options_parser.GetAs("null_value"); - if (!deli.ok() || !quote.ok() || !null_value.ok()) { - return {StatusCode::kCmdError, - "delimiter/quote/null_value option get failed" + options_parser.ToString()}; - } ::openmldb::sdk::SplitLineWithDelimiterForStrings(line, deli.value(), &cols, quote.value().empty() ? '\0' : quote.value()[0]); auto ret = InsertOneRow(database, insert_placeholder, str_cols_idx, null_value.value(), cols); @@ -3483,9 +3464,14 @@ hybridse::sdk::Status SQLClusterRouter::InsertOneRow(const std::string& database for (int i = 0; i < cnt; ++i) { if (!::openmldb::codec::AppendColumnValue(cols[i], schema->GetColumnType(i), schema->IsColumnNotNull(i), null_value, row)) { - return {StatusCode::kCmdError, "translate to insert row failed"}; + return {StatusCode::kCmdError, absl::StrCat("translate failed on column ", schema->GetColumnName(i), "(", i, + ") with value ", cols[i])}; } } + if (!row->IsComplete()) { + return {StatusCode::kCmdError, "row is not complete: " + absl::StrJoin(cols, ",")}; + } + if (!ExecuteInsert(database, insert_placeholder, row, &status)) { RETURN_NOT_OK_PREPEND(status, "insert row failed"); } @@ -3521,14 +3507,13 @@ hybridse::sdk::Status SQLClusterRouter::HandleDelete(const std::string& db, cons if (!status.IsOK()) { return status; } - status = SendDeleteRequst(table_info, &option); - if (status.IsOK()) { + status = SendDeleteRequst(table_info, option); + if (status.IsOK() && db != nameserver::INTERNAL_DB) { status = { StatusCode::kOk, "DELETE is a dangerous operation. Once deleted, it is very difficult to recover. 
You may also note that:\n" "- The deleted data will not be released immediately from the main memory; " "it remains until after a garbage collection interval (gc_interval)\n" - "- Data in the pre-aggregation table will not be updated.\n" "Please refer to this link for more details: " + base::NOTICE_URL}; } @@ -3536,8 +3521,8 @@ hybridse::sdk::Status SQLClusterRouter::HandleDelete(const std::string& db, cons } hybridse::sdk::Status SQLClusterRouter::SendDeleteRequst( - const std::shared_ptr<::openmldb::nameserver::TableInfo>& table_info, const DeleteOption* option) { - if (option->index_map.empty()) { + const std::shared_ptr<::openmldb::nameserver::TableInfo>& table_info, const DeleteOption& option) { + if (!option.idx.has_value()) { std::vector> tablets; if (!cluster_sdk_->GetTablet(table_info->db(), table_info->name(), &tablets)) { return {StatusCode::kCmdError, "get tablet failed"}; @@ -3547,39 +3532,29 @@ hybridse::sdk::Status SQLClusterRouter::SendDeleteRequst( return {StatusCode::kCmdError, "cannot connect tablet"}; } } - for (size_t idx = 0; idx < tablets.size(); idx++) { - auto tablet_client = tablets.at(idx)->GetClient(); - if (auto status = tablet_client->Delete(table_info->tid(), idx, option->index_map, option->ts_name, - option->start_ts, option->end_ts); - !status.OK()) { - return {StatusCode::kCmdError, status.GetMsg()}; - } - } - } else { - std::map> pid_index_map; - for (const auto& kv : option->index_map) { - uint32_t pid = ::openmldb::base::hash64(kv.second) % table_info->table_partition_size(); - auto iter = pid_index_map.find(pid); - if (iter == pid_index_map.end()) { - iter = pid_index_map.emplace(pid, std::map()).first; - } - iter->second.emplace(kv.first, kv.second); - } - for (const auto& kv : pid_index_map) { - auto tablet = cluster_sdk_->GetTablet(table_info->db(), table_info->name(), kv.first); - if (!tablet) { - return {StatusCode::kCmdError, "cannot connect tablet"}; - } - auto tablet_client = tablet->GetClient(); + for (size_t pid = 0; pid < tablets.size(); pid++) { + auto tablet_client = tablets.at(pid)->GetClient(); if (!tablet_client) { return {StatusCode::kCmdError, "tablet client is null"}; } - auto ret = tablet_client->Delete(table_info->tid(), kv.first, kv.second, option->ts_name, option->start_ts, - option->end_ts); + auto ret = tablet_client->Delete(table_info->tid(), pid, option, options_->request_timeout); if (!ret.OK()) { return {StatusCode::kCmdError, ret.GetMsg()}; } } + } else { + uint32_t pid = ::openmldb::base::hash64(option.key) % table_info->table_partition_size(); + auto tablet = cluster_sdk_->GetTablet(table_info->db(), table_info->name(), pid); + if (!tablet) { + return {StatusCode::kCmdError, "cannot connect tablet"}; + } + auto tablet_client = tablet->GetClient(); + if (!tablet_client) { + return {StatusCode::kCmdError, "tablet client is null"}; + } + if (auto ret = tablet_client->Delete(table_info->tid(), pid, option, options_->request_timeout); !ret.OK()) { + return {StatusCode::kCmdError, ret.GetMsg()}; + } } return {}; } @@ -3603,7 +3578,7 @@ bool SQLClusterRouter::ExecuteDelete(std::shared_ptr row, hybridse if (!status->IsOK()) { return false; } - *status = SendDeleteRequst(table_info, &option); + *status = SendDeleteRequst(table_info, option); return status->IsOK(); } @@ -3794,8 +3769,8 @@ hybridse::sdk::Status SQLClusterRouter::HandleDeploy(const std::string& db, return get_index_status; } std::stringstream index_stream; - for (auto[db, db_map] : new_index_map) { - for (auto[table, index_list] : db_map) { + for (auto [db, db_map] 
: new_index_map) { + for (auto [table, index_list] : db_map) { for (auto index : index_list) { index_stream << db << "-" << table << "-"; for (auto col : index.col_name()) { @@ -4776,6 +4751,81 @@ std::shared_ptr SQLClusterRouter::GetNameServerJobResu return rs; } +absl::StatusOr SQLClusterRouter::GetUser(const std::string& name, UserInfo* user_info) { + std::string sql = absl::StrCat("select * from ", nameserver::USER_INFO_NAME); + hybridse::sdk::Status status; + auto rs = ExecuteSQLParameterized(nameserver::INTERNAL_DB, sql, + std::shared_ptr(), &status); + if (rs == nullptr) { + return absl::InternalError(status.msg); + } + while (rs->Next()) { + if (rs->GetStringUnsafe(1) == name) { + user_info->name = name; + user_info->password = rs->GetStringUnsafe(2); + user_info->create_time = rs->GetTimeUnsafe(5); + user_info->update_time = rs->GetTimeUnsafe(6); + return true; + } + } + return false; +} + +hybridse::sdk::Status SQLClusterRouter::AddUser(const std::string& name, const std::string& password) { + auto real_password = password.empty() ? password : codec::Encrypt(password); + uint64_t cur_ts = ::baidu::common::timer::get_micros() / 1000; + std::string sql = absl::StrCat("insert into ", nameserver::USER_INFO_NAME, " values (", + "'%',", // host + "'", name, "','", // user + real_password, "',", // password + cur_ts, ",", // password_last_changed + "0,", // password_expired_time + cur_ts, ", ", // create_time + cur_ts, ",", // update_time + 1, // account_type + ",'',", // privileges + "null" // extra_info + ");"); + hybridse::sdk::Status status; + ExecuteInsert(nameserver::INTERNAL_DB, sql, &status); + return status; +} + +hybridse::sdk::Status SQLClusterRouter::UpdateUser(const UserInfo& user_info, const std::string& password) { + auto real_password = password.empty() ? 
password : codec::Encrypt(password); + uint64_t cur_ts = ::baidu::common::timer::get_micros() / 1000; + std::string sql = absl::StrCat("insert into ", nameserver::USER_INFO_NAME, " values (", + "'%',", // host + "'", user_info.name, "','", // user + real_password, "',", // password + cur_ts, ",", // password_last_changed + "0,", // password_expired_time + user_info.create_time, ", ", // create_time + cur_ts, ",", // update_time + 1, // account_type + ",'", user_info.privileges, "',", // privileges + "null" // extra_info + ");"); + hybridse::sdk::Status status; + ExecuteInsert(nameserver::INTERNAL_DB, sql, &status); + return status; +} + +hybridse::sdk::Status SQLClusterRouter::DeleteUser(const std::string& name) { + std::string sql = absl::StrCat("delete from ", nameserver::USER_INFO_NAME, + " where host = '%' and user = '", name, "';"); + hybridse::sdk::Status status; + ExecuteSQL(nameserver::INTERNAL_DB, sql, &status); + return status; +} + +void SQLClusterRouter::AddUserToConfig(std::map* config) { + config->emplace("spark.openmldb.user", GetRouterOptions()->user); + if (!GetRouterOptions()->password.empty()) { + config->emplace("spark.openmldb.password", GetRouterOptions()->password); + } +} + ::hybridse::sdk::Status SQLClusterRouter::RevertPut(const nameserver::TableInfo& table_info, uint32_t end_pid, const std::map>>& dimensions, @@ -4790,8 +4840,8 @@ ::hybridse::sdk::Status SQLClusterRouter::RevertPut(const nameserver::TableInfo& const int8_t* data = reinterpret_cast(value.data()); for (const auto& kv : dimensions) { if (static_cast(kv.first) > tablets.size()) { - return {StatusCode::kCmdError, absl::StrCat("pid ", kv.first, - " is greater than the tablets size ", tablets.size())}; + return {StatusCode::kCmdError, + absl::StrCat("pid ", kv.first, " is greater than the tablets size ", tablets.size())}; } auto tablet = tablets[kv.first]; if (!tablet) { @@ -4810,14 +4860,16 @@ ::hybridse::sdk::Status SQLClusterRouter::RevertPut(const nameserver::TableInfo& if (!index.ts_name().empty()) { if (auto it = column_map.find(index.ts_name()); it == column_map.end()) { return {StatusCode::kCmdError, absl::StrCat("invalid ts name ", index.ts_name())}; - } else if (row_view.GetInteger(data, it->second, - table_info.column_desc(it->second).data_type(), &cur_ts) != 0) { + } else if (row_view.GetInteger(data, it->second, table_info.column_desc(it->second).data_type(), + &cur_ts) != 0) { return {StatusCode::kCmdError, "get ts failed"}; } } - std::map index_val = { {val.second, val.first} }; + std::map index_val = {{val.second, val.first}}; uint64_t end_ts = cur_ts > 0 ? 
cur_ts - 1 : 0; - client->Delete(table_info.tid(), kv.first, index_val, "", cur_ts, end_ts); + DeleteOption option(val.second, val.first, "", cur_ts, end_ts); + option.enable_decode_value = false; + client->Delete(table_info.tid(), kv.first, option, options_->request_timeout); } if (kv.first == end_pid) { break; diff --git a/src/sdk/sql_cluster_router.h b/src/sdk/sql_cluster_router.h index 502ad07dab6..1226ee4f987 100644 --- a/src/sdk/sql_cluster_router.h +++ b/src/sdk/sql_cluster_router.h @@ -49,6 +49,7 @@ class DeleteOption; using TableInfoMap = std::map>; class Bias; +struct UserInfo; class SQLClusterRouter : public SQLRouter { public: @@ -64,6 +65,8 @@ class SQLClusterRouter : public SQLRouter { bool Init(); + bool Auth(); + bool CreateDB(const std::string& db, hybridse::sdk::Status* status) override; bool DropDB(const std::string& db, hybridse::sdk::Status* status) override; @@ -87,7 +90,7 @@ class SQLClusterRouter : public SQLRouter { bool ExecuteInsert(const std::string& db, const std::string& name, int tid, int partition_num, hybridse::sdk::ByteArrayPtr dimension, int dimension_len, - hybridse::sdk::ByteArrayPtr value, int len, hybridse::sdk::Status* status) override; + hybridse::sdk::ByteArrayPtr value, int len, bool put_if_absent, hybridse::sdk::Status* status) override; bool ExecuteDelete(std::shared_ptr row, hybridse::sdk::Status* status) override; @@ -316,10 +319,11 @@ class SQLClusterRouter : public SQLRouter { bool GetInsertInfo(const std::string& db, const std::string& sql, ::hybridse::sdk::Status* status, std::shared_ptr<::openmldb::nameserver::TableInfo>* table_info, DefaultValueMap* default_map, - uint32_t* str_length, std::vector* stmt_column_idx_in_table); + uint32_t* str_length, std::vector* stmt_column_idx_in_table, bool* put_if_absent); bool GetMultiRowInsertInfo(const std::string& db, const std::string& sql, ::hybridse::sdk::Status* status, std::shared_ptr<::openmldb::nameserver::TableInfo>* table_info, - std::vector* default_maps, std::vector* str_lengths); + std::vector* default_maps, std::vector* str_lengths, + bool* put_if_absent); DefaultValueMap GetDefaultMap(const std::shared_ptr<::openmldb::nameserver::TableInfo>& table_info, const std::map& column_map, ::hybridse::node::ExprListNode* row, @@ -371,7 +375,7 @@ class SQLClusterRouter : public SQLRouter { const hybridse::node::ExprNode* condition); hybridse::sdk::Status SendDeleteRequst(const std::shared_ptr& table_info, - const DeleteOption* option); + const DeleteOption& option); hybridse::sdk::Status HandleIndex(const std::string& db, const std::set>& table_pair, @@ -423,6 +427,11 @@ class SQLClusterRouter : public SQLRouter { int64_t timeout_ms, const base::Slice& row, const std::string& router_col, hybridse::sdk::Status* status); + absl::StatusOr GetUser(const std::string& name, UserInfo* user_info); + hybridse::sdk::Status AddUser(const std::string& name, const std::string& password); + hybridse::sdk::Status UpdateUser(const UserInfo& user_info, const std::string& password); + hybridse::sdk::Status DeleteUser(const std::string& name); + void AddUserToConfig(std::map* config); ::hybridse::sdk::Status RevertPut(const nameserver::TableInfo& table_info, uint32_t end_pid, const std::map>>& dimensions, @@ -444,6 +453,14 @@ class SQLClusterRouter : public SQLRouter { std::atomic insert_memory_usage_limit_ = 0; // [0-100], the default value 0 means unlimited }; +struct UserInfo { + std::string name; + std::string password; + uint64_t create_time = 0; + uint64_t update_time = 0; + std::string privileges; +}; + 
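// A minimal usage sketch for the user-management helpers above, assuming an
// already-connected SQLRouter. The statement spellings (CREATE USER ... OPTIONS,
// ALTER USER ... SET OPTIONS, DROP USER) are assumptions for illustration and are
// not defined by this patch; the sketch is meant to live in its own .cc file.
#include <string>
#include "sdk/sql_router.h"

void ManageUsersSketch(openmldb::sdk::SQLRouter& router) {
    hybridse::sdk::Status status;
    const std::string db = "demo_db";  // any existing database
    // CREATE USER -> kPlanTypeCreateUser -> AddUser(): inserts a row into the
    // internal user table with an Encrypt()-ed password.
    router.ExecuteSQL(db, "CREATE USER IF NOT EXISTS alice OPTIONS (password='a_pwd');", &status);
    if (!status.IsOK()) { /* inspect status.msg */ }
    // ALTER USER -> kPlanTypeAlterUser -> UpdateUser(): rewrites the row,
    // keeping create_time and refreshing update_time.
    router.ExecuteSQL(db, "ALTER USER IF EXISTS alice SET OPTIONS (password='new_pwd');", &status);
    // DROP USER -> kCmdDropUser -> DeleteUser(): DELETE from
    // nameserver::USER_INFO_NAME where host = '%' and user = 'alice'.
    router.ExecuteSQL(db, "DROP USER IF EXISTS alice;", &status);
}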
class Bias { public: // If get failed, return false and won't change bias. Check negative bias value for your own logic diff --git a/src/sdk/sql_cluster_test.cc b/src/sdk/sql_cluster_test.cc index 9374841d71e..ee421a27ecb 100644 --- a/src/sdk/sql_cluster_test.cc +++ b/src/sdk/sql_cluster_test.cc @@ -121,13 +121,19 @@ TEST_F(SQLClusterDDLTest, TestShowAndDropDeployment) { router->ExecuteSQL(db, "deploy " + deploy_name + " select col1 from " + table_name + ";", &status); ASSERT_TRUE(status.IsOK()); - router->ExecuteSQL(db2, "deploy " + deploy_name + " select col1 from " + db + "." + table_name + ";", &status); + std::string sql = absl::StrCat("deploy ", deploy_name, + " OPTIONS(RANGE_BIAS=\"inf\", ROWS_BIAS=\"inf\") select col1 from ", db, ".", table_name, ";"); + router->ExecuteSQL(db2, sql, &status); ASSERT_TRUE(status.IsOK()); - router->ExecuteSQL(db, "show deployment " + deploy_name + ";", &status); + auto rs = router->ExecuteSQL(db, "show deployment " + deploy_name + ";", &status); ASSERT_TRUE(status.IsOK()); - router->ExecuteSQL(db, "show deployment " + db2 + "." + deploy_name + ";", &status); + ASSERT_TRUE(rs->Next()); + ASSERT_TRUE(rs->GetStringUnsafe(0).find("OPTIONS") == std::string::npos); + rs = router->ExecuteSQL(db, "show deployment " + db2 + "." + deploy_name + ";", &status); ASSERT_TRUE(status.IsOK()); + ASSERT_TRUE(rs->Next()); + ASSERT_TRUE(rs->GetStringUnsafe(0).find("OPTIONS(RANGE_BIAS=\"inf\", ROWS_BIAS=\"inf\")") != std::string::npos); router->ExecuteSQL(db, "drop deployment " + deploy_name + ";", &status); ASSERT_TRUE(status.IsOK()); @@ -323,47 +329,6 @@ TEST_F(SQLClusterDDLTest, CreateIndexCheck) { ASSERT_TRUE(router->DropDB(db, &status)); } -TEST_F(SQLClusterDDLTest, TestDelete) { - std::string name = "test" + GenRand(); - ::hybridse::sdk::Status status; - std::string ddl; - - std::string db = "db" + GenRand(); - ASSERT_TRUE(router->CreateDB(db, &status)); - ddl = absl::StrCat("create table ", name, - "(col1 string, col2 string, col3 string, col4 bigint, col5 bigint, col6 bigint, col7 string," - "index(key=col1, ts=col4), index(key=(col1, col2), ts=col4), index(key=col3, ts=col5));"); - ASSERT_TRUE(router->ExecuteDDL(db, ddl, &status)) << "ddl: " << ddl; - ASSERT_TRUE(router->RefreshCatalog()); - router->ExecuteSQL(db, "insert into " + name + " values ('a', 'aa', 'aaa', 100, 101, 102, 'xx');", &status); - router->ExecuteSQL(db, "insert into " + name + " values ('b', 'bb', 'bbb', 200, 201, 202, 'xx');", &status); - auto rs = router->ExecuteSQL(db, "select * from " + name + ";", &status); - ASSERT_EQ(rs->Size(), 2); - rs = router->ExecuteSQL(db, "delete from " + name + " where col1 = 'xxx' and col5 > 100;", &status); - ASSERT_FALSE(status.IsOK()); - rs = router->ExecuteSQL(db, "delete from " + name + " where col1 = 'xxx' and col6 > 100;", &status); - ASSERT_FALSE(status.IsOK()); - rs = router->ExecuteSQL(db, "delete from " + name + " where col1 = 'xxx' and col3 = 'aaa';", &status); - ASSERT_FALSE(status.IsOK()); - rs = router->ExecuteSQL(db, "delete from " + name + " where col7 = 'xxx' and col3 = 'aaa';", &status); - ASSERT_FALSE(status.IsOK()); - router->ExecuteSQL(db, "delete from " + name + " where col6 > 100;", &status); - ASSERT_FALSE(status.IsOK()); - router->ExecuteSQL(db, "delete from " + name + " where col4 > 100 and col5 = 200;", &status); - ASSERT_FALSE(status.IsOK()); - router->ExecuteSQL(db, "delete from " + name + " where col5 > 100;", &status); - ASSERT_TRUE(status.IsOK()) << status.msg; - rs = router->ExecuteSQL(db, "select * from " + name + ";", 
&status); - ASSERT_EQ(rs->Size(), 2); - router->ExecuteSQL(db, "delete from " + name + " where col4 > 100;", &status); - ASSERT_TRUE(status.IsOK()); - rs = router->ExecuteSQL(db, "select * from " + name + ";", &status); - ASSERT_EQ(rs->Size(), 1); - - ASSERT_TRUE(router->ExecuteDDL(db, "drop table " + name + ";", &status)); - ASSERT_TRUE(router->DropDB(db, &status)); -} - TEST_F(SQLClusterDDLTest, ColumnDefaultValue) { std::string name = "test" + GenRand(); ::hybridse::sdk::Status status; diff --git a/src/sdk/sql_insert_row.cc b/src/sdk/sql_insert_row.cc index a2d44571be2..492bb80e49b 100644 --- a/src/sdk/sql_insert_row.cc +++ b/src/sdk/sql_insert_row.cc @@ -29,33 +29,35 @@ namespace sdk { SQLInsertRows::SQLInsertRows(std::shared_ptr<::openmldb::nameserver::TableInfo> table_info, std::shared_ptr schema, DefaultValueMap default_map, - uint32_t default_str_length, const std::vector& hole_idx_arr) + uint32_t default_str_length, const std::vector& hole_idx_arr, bool put_if_absent) : table_info_(std::move(table_info)), schema_(std::move(schema)), default_map_(std::move(default_map)), default_str_length_(default_str_length), - hole_idx_arr_(hole_idx_arr) {} + hole_idx_arr_(hole_idx_arr), + put_if_absent_(put_if_absent) {} std::shared_ptr SQLInsertRows::NewRow() { if (!rows_.empty() && !rows_.back()->IsComplete()) { return {}; } - std::shared_ptr row = - std::make_shared(table_info_, schema_, default_map_, default_str_length_, hole_idx_arr_); + std::shared_ptr row = std::make_shared( + table_info_, schema_, default_map_, default_str_length_, hole_idx_arr_, put_if_absent_); rows_.push_back(row); return row; } SQLInsertRow::SQLInsertRow(std::shared_ptr<::openmldb::nameserver::TableInfo> table_info, std::shared_ptr schema, DefaultValueMap default_map, - uint32_t default_string_length) + uint32_t default_string_length, bool put_if_absent) : table_info_(table_info), schema_(std::move(schema)), default_map_(std::move(default_map)), default_string_length_(default_string_length), rb_(table_info->column_desc()), val_(), - str_size_(0) { + str_size_(0), + put_if_absent_(put_if_absent) { std::map column_name_map; for (int idx = 0; idx < table_info_->column_desc_size(); idx++) { column_name_map.emplace(table_info_->column_desc(idx).name(), idx); @@ -81,8 +83,9 @@ SQLInsertRow::SQLInsertRow(std::shared_ptr<::openmldb::nameserver::TableInfo> ta SQLInsertRow::SQLInsertRow(std::shared_ptr<::openmldb::nameserver::TableInfo> table_info, std::shared_ptr schema, DefaultValueMap default_map, - uint32_t default_str_length, std::vector hole_idx_arr) - : SQLInsertRow(std::move(table_info), std::move(schema), std::move(default_map), default_str_length) { + uint32_t default_str_length, std::vector hole_idx_arr, bool put_if_absent) + : SQLInsertRow(std::move(table_info), std::move(schema), std::move(default_map), default_str_length, + put_if_absent) { hole_idx_arr_ = std::move(hole_idx_arr); } diff --git a/src/sdk/sql_insert_row.h b/src/sdk/sql_insert_row.h index ded1c824e19..af18891587f 100644 --- a/src/sdk/sql_insert_row.h +++ b/src/sdk/sql_insert_row.h @@ -103,12 +103,13 @@ class DefaultValueContainer { class SQLInsertRow { public: + // for raw insert sql(no hole) SQLInsertRow(std::shared_ptr<::openmldb::nameserver::TableInfo> table_info, std::shared_ptr schema, DefaultValueMap default_map, - uint32_t default_str_length); + uint32_t default_str_length, bool put_if_absent); SQLInsertRow(std::shared_ptr<::openmldb::nameserver::TableInfo> table_info, std::shared_ptr schema, DefaultValueMap default_map, - uint32_t 
default_str_length, std::vector hole_idx_arr); + uint32_t default_str_length, std::vector hole_idx_arr, bool put_if_absent); ~SQLInsertRow() = default; bool Init(int str_length); bool AppendBool(bool val); @@ -155,6 +156,10 @@ class SQLInsertRow { return *table_info_; } + bool IsPutIfAbsent() const { + return put_if_absent_; + } + private: bool MakeDefault(); void PackDimension(const std::string& val); @@ -175,13 +180,14 @@ class SQLInsertRow { ::openmldb::codec::RowBuilder rb_; std::string val_; uint32_t str_size_; + bool put_if_absent_; }; class SQLInsertRows { public: SQLInsertRows(std::shared_ptr<::openmldb::nameserver::TableInfo> table_info, std::shared_ptr schema, DefaultValueMap default_map, uint32_t str_size, - const std::vector& hole_idx_arr); + const std::vector& hole_idx_arr, bool put_if_absent); ~SQLInsertRows() = default; std::shared_ptr NewRow(); inline uint32_t GetCnt() { return rows_.size(); } @@ -200,6 +206,7 @@ class SQLInsertRows { DefaultValueMap default_map_; uint32_t default_str_length_; std::vector hole_idx_arr_; + bool put_if_absent_; std::vector> rows_; }; diff --git a/src/sdk/sql_router.h b/src/sdk/sql_router.h index 4317d435f8c..f68d7d39a1c 100644 --- a/src/sdk/sql_router.h +++ b/src/sdk/sql_router.h @@ -27,6 +27,7 @@ #include #include "sdk/base.h" +#include "sdk/options.h" #include "sdk/result_set.h" #include "sdk/sql_delete_row.h" #include "sdk/sql_insert_row.h" @@ -39,34 +40,6 @@ namespace sdk { typedef char* ByteArrayPtr; -struct BasicRouterOptions { - virtual ~BasicRouterOptions() = default; - bool enable_debug = false; - uint32_t max_sql_cache_size = 50; - // == gflag `request_timeout` default value(no gflags here cuz swig) - uint32_t request_timeout = 60000; - // default 0(INFO), INFO, WARNING, ERROR, and FATAL are 0, 1, 2, and 3 - int glog_level = 0; - // empty means to stderr - std::string glog_dir = ""; -}; - -struct SQLRouterOptions : BasicRouterOptions { - std::string zk_cluster; - std::string zk_path; - uint32_t zk_session_timeout = 2000; - std::string spark_conf_path; - uint32_t zk_log_level = 3; // PY/JAVA SDK default info log - std::string zk_log_file; - std::string zk_auth_schema = "digest"; - std::string zk_cert; -}; - -struct StandaloneOptions : BasicRouterOptions { - std::string host; - uint32_t port; -}; - class ExplainInfo { public: ExplainInfo() {} @@ -130,7 +103,7 @@ class SQLRouter { virtual bool ExecuteInsert(const std::string& db, const std::string& name, int tid, int partition_num, hybridse::sdk::ByteArrayPtr dimension, int dimension_len, - hybridse::sdk::ByteArrayPtr value, int len, hybridse::sdk::Status* status) = 0; + hybridse::sdk::ByteArrayPtr value, int len, bool put_if_absent, hybridse::sdk::Status* status) = 0; virtual bool ExecuteDelete(std::shared_ptr row, hybridse::sdk::Status* status) = 0; diff --git a/src/sdk/sql_router_sdk.i b/src/sdk/sql_router_sdk.i index 07bb3d5741b..15ea2b8e7c4 100644 --- a/src/sdk/sql_router_sdk.i +++ b/src/sdk/sql_router_sdk.i @@ -71,6 +71,7 @@ %shared_ptr(openmldb::sdk::DAGNode); %{ +#include "sdk/options.h" #include "sdk/sql_router.h" #include "sdk/result_set.h" #include "sdk/base_schema.h" @@ -98,6 +99,7 @@ using openmldb::sdk::TableReader; using openmldb::sdk::DefaultValueContainer; %} +%include "sdk/options.h" %include "sdk/sql_router.h" %include "sdk/base_schema.h" %include "sdk/base.h" diff --git a/src/sdk/sql_standalone_sdk_test.cc b/src/sdk/sql_standalone_sdk_test.cc index e61cf1ea76c..2b2a4cc2be8 100644 --- a/src/sdk/sql_standalone_sdk_test.cc +++ b/src/sdk/sql_standalone_sdk_test.cc @@ 
-882,7 +882,8 @@ int main(int argc, char** argv) { ::openmldb::sdk::StandaloneEnv env; env.SetUp(); // connect to nameserver - ::openmldb::sdk::DBSDK *cs = new ::openmldb::sdk::StandAloneSDK("127.0.0.1", env.GetNsPort()); + auto sopt = std::make_shared<::openmldb::sdk::StandaloneOptions>("127.0.0.1", env.GetNsPort()); + ::openmldb::sdk::DBSDK *cs = new ::openmldb::sdk::StandAloneSDK(sopt); bool ok = cs->Init(); if (!ok) { std::cout << "Fail to connect to db" << std::endl; diff --git a/src/storage/aggregator.cc b/src/storage/aggregator.cc index 7814c687be5..4615c87bc20 100644 --- a/src/storage/aggregator.cc +++ b/src/storage/aggregator.cc @@ -641,9 +641,9 @@ bool Aggregator::FlushAggrBuffer(const std::string& key, const std::string& filt auto dimension = entry.add_dimensions(); dimension->set_idx(aggr_index_pos_); dimension->set_key(key); - bool ok = aggr_table_->Put(time, entry.value(), entry.dimensions()); - if (!ok) { - PDLOG(ERROR, "Aggregator put failed"); + auto st = aggr_table_->Put(time, entry.value(), entry.dimensions()); + if (!st.ok()) { + LOG(ERROR) << "Aggregator put failed: " << st.ToString(); return false; } entry.set_pk(key); diff --git a/src/storage/disk_table.cc b/src/storage/disk_table.cc index b41c9f8fd3c..af35ab9a170 100644 --- a/src/storage/disk_table.cc +++ b/src/storage/disk_table.cc @@ -227,7 +227,8 @@ bool DiskTable::Put(const std::string& pk, uint64_t time, const char* data, uint } } -bool DiskTable::Put(uint64_t time, const std::string& value, const Dimensions& dimensions) { +absl::Status DiskTable::Put(uint64_t time, const std::string& value, const Dimensions& dimensions, bool put_if_absent) { + // disk table will update if key-time is the same, so no need to handle put_if_absent const int8_t* data = reinterpret_cast(value.data()); std::string uncompress_data; if (GetCompressType() == openmldb::type::kSnappy) { @@ -237,15 +238,14 @@ bool DiskTable::Put(uint64_t time, const std::string& value, const Dimensions& d uint8_t version = codec::RowView::GetSchemaVersion(data); auto decoder = GetVersionDecoder(version); if (decoder == nullptr) { - PDLOG(WARNING, "invalid schema version %u, tid %u pid %u", version, id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": invalid schema version ", version)); } rocksdb::WriteBatch batch; for (auto it = dimensions.begin(); it != dimensions.end(); ++it) { auto index_def = table_index_.GetIndex(it->idx()); if (!index_def || !index_def->IsReady()) { - PDLOG(WARNING, "failed putting key %s to dimension %u in table tid %u pid %u", it->key().c_str(), - it->idx(), id_, pid_); + PDLOG(WARNING, "failed putting key %s to dimension %u in table tid %u pid %u", it->key().c_str(), it->idx(), + id_, pid_); } int32_t inner_pos = table_index_.GetInnerIndexPos(it->idx()); auto inner_index = table_index_.GetInnerIndex(inner_pos); @@ -256,12 +256,10 @@ bool DiskTable::Put(uint64_t time, const std::string& value, const Dimensions& d if (ts_col->IsAutoGenTs()) { ts = time; } else if (decoder->GetInteger(data, ts_col->GetId(), ts_col->GetType(), &ts) != 0) { - PDLOG(WARNING, "get ts failed. tid %u pid %u", id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": get ts failed")); } if (ts < 0) { - PDLOG(WARNING, "ts %ld is negative. 
tid %u pid %u", ts, id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": ts is negative ", ts)); } if (inner_index->GetIndex().size() > 1) { combine_key = CombineKeyTs(it->key(), ts, ts_col->GetId()); @@ -275,10 +273,9 @@ bool DiskTable::Put(uint64_t time, const std::string& value, const Dimensions& d auto s = db_->Write(write_opts_, &batch); if (s.ok()) { offset_.fetch_add(1, std::memory_order_relaxed); - return true; + return absl::OkStatus(); } else { - DEBUGLOG("Put failed. tid %u pid %u msg %s", id_, pid_, s.ToString().c_str()); - return false; + return absl::InternalError(absl::StrCat(id_, ".", pid_, ": ", s.ToString())); } } diff --git a/src/storage/disk_table.h b/src/storage/disk_table.h index be549d0c2cd..9e207dc831d 100644 --- a/src/storage/disk_table.h +++ b/src/storage/disk_table.h @@ -21,6 +21,7 @@ #include #include #include + #include "base/slice.h" #include "base/status.h" #include "common/timer.h" @@ -102,7 +103,7 @@ class AbsoluteTTLCompactionFilter : public rocksdb::CompactionFilter { return false; } uint32_t ts_idx = *((uint32_t*)(key.data() + key.size() - TS_LEN - // NOLINT - TS_POS_LEN)); + TS_POS_LEN)); bool has_found = false; for (const auto& index : indexs) { auto ts_col = index->GetTsColumn(); @@ -110,7 +111,7 @@ class AbsoluteTTLCompactionFilter : public rocksdb::CompactionFilter { return false; } if (ts_col->GetId() == ts_idx && - index->GetTTL()->ttl_type == openmldb::storage::TTLType::kAbsoluteTime) { + index->GetTTL()->ttl_type == openmldb::storage::TTLType::kAbsoluteTime) { real_ttl = index->GetTTL()->abs_ttl; has_found = true; break; @@ -172,7 +173,8 @@ class DiskTable : public Table { bool Put(const std::string& pk, uint64_t time, const char* data, uint32_t size) override; - bool Put(uint64_t time, const std::string& value, const Dimensions& dimensions) override; + absl::Status Put(uint64_t time, const std::string& value, const Dimensions& dimensions, + bool put_if_absent) override; bool Get(uint32_t idx, const std::string& pk, uint64_t ts, std::string& value); // NOLINT @@ -183,9 +185,6 @@ class DiskTable : public Table { base::Status Truncate(); - bool Delete(uint32_t idx, const std::string& pk, - const std::optional& start_ts, const std::optional& end_ts) override; - uint64_t GetExpireTime(const TTLSt& ttl_st) override; uint64_t GetRecordCnt() override { @@ -233,11 +232,15 @@ class DiskTable : public Table { uint64_t GetRecordByteSize() const override { return 0; } uint64_t GetRecordIdxByteSize() override; - int GetCount(uint32_t index, const std::string& pk, uint64_t& count) override; // NOLINT + int GetCount(uint32_t index, const std::string& pk, uint64_t& count) override; // NOLINT private: base::Status Delete(uint32_t idx, const std::string& pk, uint64_t start_ts, const std::optional& end_ts); + bool Delete(uint32_t idx, const std::string& pk, + const std::optional& start_ts, const std::optional& end_ts) override; + + private: rocksdb::DB* db_; rocksdb::WriteOptions write_opts_; diff --git a/src/storage/disk_table_test.cc b/src/storage/disk_table_test.cc index 2a4e0d53c98..04a5d6edbb3 100644 --- a/src/storage/disk_table_test.cc +++ b/src/storage/disk_table_test.cc @@ -111,7 +111,7 @@ TEST_F(DiskTableTest, MultiDimensionPut) { mapping.insert(std::make_pair("idx1", 1)); mapping.insert(std::make_pair("idx2", 2)); std::string table_path = FLAGS_hdd_root_path + "/2_1"; - DiskTable* table = new DiskTable("yjtable2", 2, 1, mapping, 10, ::openmldb::type::TTLType::kAbsoluteTime, + Table* table = new 
DiskTable("yjtable2", 2, 1, mapping, 10, ::openmldb::type::TTLType::kAbsoluteTime, ::openmldb::common::StorageMode::kHDD, table_path); ASSERT_TRUE(table->Init()); ASSERT_EQ(3, (int64_t)table->GetIdxCnt()); @@ -136,7 +136,7 @@ TEST_F(DiskTableTest, MultiDimensionPut) { d2->set_idx(2); std::string value; ASSERT_EQ(0, sdk_codec.EncodeRow(row, &value)); - bool ok = table->Put(1, value, dimensions); + bool ok = table->Put(1, value, dimensions).ok(); ASSERT_TRUE(ok); // some functions in disk table need to be implemented. // refer to issue #1238 @@ -202,7 +202,7 @@ TEST_F(DiskTableTest, MultiDimensionPut) { row = {"valuea", "valueb", "valuec"}; ASSERT_EQ(0, sdk_codec.EncodeRow(row, &value)); - ASSERT_TRUE(table->Put(2, value, dimensions)); + ASSERT_TRUE(table->Put(2, value, dimensions).ok()); it = table->NewIterator(0, "key2", ticket); it->SeekToFirst(); @@ -223,7 +223,7 @@ TEST_F(DiskTableTest, MultiDimensionPut) { delete it; std::string val; - ASSERT_TRUE(table->Get(1, "key1", 2, val)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, "key1", 2, val)); data = reinterpret_cast(val.data()); version = codec::RowView::GetSchemaVersion(data); decoder = table->GetVersionDecoder(version); @@ -277,7 +277,7 @@ TEST_F(DiskTableTest, LongPut) { mapping.insert(std::make_pair("idx0", 0)); mapping.insert(std::make_pair("idx1", 1)); std::string table_path = FLAGS_ssd_root_path + "/3_1"; - DiskTable* table = new DiskTable("yjtable3", 3, 1, mapping, 10, ::openmldb::type::TTLType::kAbsoluteTime, + Table* table = new DiskTable("yjtable3", 3, 1, mapping, 10, ::openmldb::type::TTLType::kAbsoluteTime, ::openmldb::common::StorageMode::kSSD, table_path); auto meta = ::openmldb::test::GetTableMeta({"idx0", "idx1"}); ::openmldb::codec::SDKCodec sdk_codec(meta); @@ -297,7 +297,7 @@ TEST_F(DiskTableTest, LongPut) { std::string value; ASSERT_EQ(0, sdk_codec.EncodeRow(row, &value)); for (int k = 0; k < 10; k++) { - ASSERT_TRUE(table->Put(ts + k, value, dimensions)); + ASSERT_TRUE(table->Put(ts + k, value, dimensions).ok()); } } for (int idx = 0; idx < 10; idx++) { @@ -465,7 +465,7 @@ TEST_F(DiskTableTest, TraverseIterator) { } ASSERT_EQ(20, count); std::string val; - ASSERT_TRUE(table->Get(0, "test98", 9548, val)); + ASSERT_TRUE(reinterpret_cast(table)->Get(0, "test98", 9548, val)); ASSERT_EQ("valu8", val); delete it; delete table; @@ -733,7 +733,7 @@ TEST_F(DiskTableTest, CompactFilter) { std::map mapping; mapping.insert(std::make_pair("idx0", 0)); std::string table_path = FLAGS_hdd_root_path + "/10_1"; - DiskTable* table = new DiskTable("t1", 10, 1, mapping, 10, ::openmldb::type::TTLType::kAbsoluteTime, + Table* table = new DiskTable("t1", 10, 1, mapping, 10, ::openmldb::type::TTLType::kAbsoluteTime, ::openmldb::common::StorageMode::kHDD, table_path); ASSERT_TRUE(table->Init()); uint64_t cur_time = ::baidu::common::timer::get_micros() / 1000; @@ -754,24 +754,24 @@ TEST_F(DiskTableTest, CompactFilter) { for (int k = 0; k < 5; k++) { std::string value; if (k > 2) { - ASSERT_TRUE(table->Get(key, ts - k - 10 * 60 * 1000, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(key, ts - k - 10 * 60 * 1000, value)); ASSERT_EQ("value9", value); } else { - ASSERT_TRUE(table->Get(key, ts - k, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(key, ts - k, value)); ASSERT_EQ("value", value); } } } - table->CompactDB(); + reinterpret_cast(table)->CompactDB(); for (int idx = 0; idx < 100; idx++) { std::string key = "test" + std::to_string(idx); uint64_t ts = cur_time; for (int k = 0; k < 5; k++) { std::string value; if (k > 2) { - 
ASSERT_FALSE(table->Get(key, ts - k - 10 * 60 * 1000, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(key, ts - k - 10 * 60 * 1000, value)); } else { - ASSERT_TRUE(table->Get(key, ts - k, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(key, ts - k, value)); ASSERT_EQ("value", value); } } @@ -794,7 +794,7 @@ TEST_F(DiskTableTest, CompactFilterMulTs) { SchemaCodec::SetIndex(table_meta.add_column_key(), "mcc", "mcc", "ts2", ::openmldb::type::kAbsoluteTime, 5, 0); std::string table_path = FLAGS_hdd_root_path + "/11_1"; - DiskTable* table = new DiskTable(table_meta, table_path); + Table* table = new DiskTable(table_meta, table_path); ASSERT_TRUE(table->Init()); codec::SDKCodec codec(table_meta); @@ -818,7 +818,7 @@ TEST_F(DiskTableTest, CompactFilterMulTs) { std::to_string(cur_time - i * 60 * 1000)}; std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - ASSERT_TRUE(table->Put(cur_time - i * 60 * 1000, value, dims)); + ASSERT_TRUE(table->Put(cur_time - i * 60 * 1000, value, dims).ok()); } } else { @@ -828,7 +828,7 @@ TEST_F(DiskTableTest, CompactFilterMulTs) { std::to_string(cur_time - i)}; std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - ASSERT_TRUE(table->Put(cur_time - i, value, dims)); + ASSERT_TRUE(table->Put(cur_time - i, value, dims).ok()); } } } @@ -860,11 +860,11 @@ TEST_F(DiskTableTest, CompactFilterMulTs) { std::string e_value; ASSERT_EQ(0, codec.EncodeRow(row, &e_value)); std::string value; - ASSERT_TRUE(table->Get(0, key, ts - i * 60 * 1000, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(0, key, ts - i * 60 * 1000, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(1, key, ts - i * 60 * 1000, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, key, ts - i * 60 * 1000, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(2, key1, ts - i * 60 * 1000, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(2, key1, ts - i * 60 * 1000, value)); } } else { @@ -874,15 +874,15 @@ TEST_F(DiskTableTest, CompactFilterMulTs) { std::string e_value; ASSERT_EQ(0, codec.EncodeRow(row, &e_value)); std::string value; - ASSERT_TRUE(table->Get(0, key, ts - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(0, key, ts - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(1, key, ts - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, key, ts - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(2, key1, ts - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(2, key1, ts - i, value)); } } } - table->CompactDB(); + reinterpret_cast(table)->CompactDB(); iter = table->NewIterator(0, "card0", ticket); iter->SeekToFirst(); while (iter->Valid()) { @@ -908,18 +908,18 @@ TEST_F(DiskTableTest, CompactFilterMulTs) { ASSERT_EQ(0, codec.EncodeRow(row, &e_value)); std::string value; if (i < 3) { - ASSERT_TRUE(table->Get(0, key, cur_ts, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(0, key, cur_ts, value)); ASSERT_EQ(e_value, value); } else { - ASSERT_FALSE(table->Get(0, key, cur_ts, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(0, key, cur_ts, value)); } if (i < 5) { - ASSERT_TRUE(table->Get(1, key, cur_ts, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, key, cur_ts, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(2, key1, cur_ts, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(2, key1, cur_ts, value)); } else { - ASSERT_FALSE(table->Get(1, key, cur_ts, value)); - ASSERT_FALSE(table->Get(2, key1, cur_ts, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(1, key, cur_ts, value)); + 
ASSERT_FALSE(reinterpret_cast(table)->Get(2, key1, cur_ts, value)); } } } else { @@ -929,11 +929,11 @@ TEST_F(DiskTableTest, CompactFilterMulTs) { std::string e_value; ASSERT_EQ(0, codec.EncodeRow(row, &e_value)); std::string value; - ASSERT_TRUE(table->Get(0, key, ts - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(0, key, ts - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(1, key, ts - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, key, ts - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(2, key1, ts - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(2, key1, ts - i, value)); } } } @@ -955,7 +955,8 @@ TEST_F(DiskTableTest, GcHeadMulTs) { SchemaCodec::SetIndex(table_meta.add_column_key(), "mcc", "mcc", "ts2", ::openmldb::type::kLatestTime, 0, 5); std::string table_path = FLAGS_hdd_root_path + "/12_1"; - DiskTable* table = new DiskTable(table_meta, table_path); + // Table base class doesn't have Get method, cast to DiskTable to call Get + Table* table = new DiskTable(table_meta, table_path); ASSERT_TRUE(table->Init()); codec::SDKCodec codec(table_meta); @@ -980,7 +981,7 @@ TEST_F(DiskTableTest, GcHeadMulTs) { std::to_string(cur_time - i), std::to_string(cur_time - i)}; std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - ASSERT_TRUE(table->Put(cur_time - i, value, dims)); + ASSERT_TRUE(table->Put(cur_time - i, value, dims).ok()); } } Ticket ticket; @@ -1006,15 +1007,15 @@ TEST_F(DiskTableTest, GcHeadMulTs) { ASSERT_EQ(0, codec.EncodeRow(row, &e_value)); std::string value; if (idx == 50 && i > 2) { - ASSERT_FALSE(table->Get(0, key, cur_time - i, value)); - ASSERT_FALSE(table->Get(1, key, cur_time - i, value)); - ASSERT_FALSE(table->Get(2, key1, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(0, key, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(1, key, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(2, key1, cur_time - i, value)); } else { - ASSERT_TRUE(table->Get(0, key, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(0, key, cur_time - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(1, key, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, key, cur_time - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(2, key1, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(2, key1, cur_time - i, value)); } } } @@ -1041,24 +1042,24 @@ TEST_F(DiskTableTest, GcHeadMulTs) { ASSERT_EQ(0, codec.EncodeRow(row, &e_value)); std::string value; if (idx == 50 && i > 2) { - ASSERT_FALSE(table->Get(0, key, cur_time - i, value)); - ASSERT_FALSE(table->Get(1, key, cur_time - i, value)); - ASSERT_FALSE(table->Get(2, key1, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(0, key, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(1, key, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(2, key1, cur_time - i, value)); } else if (i < 3) { - ASSERT_TRUE(table->Get(0, key, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(0, key, cur_time - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(1, key, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, key, cur_time - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(2, key1, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(2, key1, cur_time - i, value)); } else if (i < 5) { - ASSERT_FALSE(table->Get(0, key, cur_time - i, value)); - 
ASSERT_TRUE(table->Get(1, key, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(0, key, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, key, cur_time - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(2, key1, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(2, key1, cur_time - i, value)); } else { - ASSERT_FALSE(table->Get(0, key, cur_time - i, value)); - ASSERT_FALSE(table->Get(1, key, cur_time - i, value)); - ASSERT_FALSE(table->Get(2, key1, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(0, key, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(1, key, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(2, key1, cur_time - i, value)); } } } @@ -1089,7 +1090,7 @@ TEST_F(DiskTableTest, GcHead) { uint64_t ts = 9537; for (int k = 0; k < 5; k++) { std::string value; - ASSERT_TRUE(table->Get(key, ts + k, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(key, ts + k, value)); if (idx == 10 && k == 2) { ASSERT_EQ("value8", value); } else { @@ -1104,9 +1105,9 @@ TEST_F(DiskTableTest, GcHead) { for (int k = 0; k < 5; k++) { std::string value; if (k < 2) { - ASSERT_FALSE(table->Get(key, ts + k, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(key, ts + k, value)); } else { - ASSERT_TRUE(table->Get(key, ts + k, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(key, ts + k, value)); if (idx == 10 && k == 2) { ASSERT_EQ("value8", value); } else { diff --git a/src/storage/key_entry.h b/src/storage/key_entry.h index e8969c9b832..1b5f4778f4f 100644 --- a/src/storage/key_entry.h +++ b/src/storage/key_entry.h @@ -49,11 +49,19 @@ struct DataBlock { delete[] data; data = nullptr; } + + bool EqualWithoutCnt(const DataBlock& other) const { + if (size != other.size) { + return false; + } + // you can improve it ref RowBuilder::InitBuffer header version + return memcmp(data, other.data, size) == 0; + } }; // the desc time comparator struct TimeComparator { - int operator() (uint64_t a, uint64_t b) const { + int operator()(uint64_t a, uint64_t b) const { if (a > b) { return -1; } else if (a == b) { @@ -86,7 +94,6 @@ class KeyEntry { std::atomic count_; }; - } // namespace storage } // namespace openmldb diff --git a/src/storage/mem_table.cc b/src/storage/mem_table.cc index db7578619b7..91148eef09b 100644 --- a/src/storage/mem_table.cc +++ b/src/storage/mem_table.cc @@ -140,43 +140,43 @@ bool MemTable::Put(const std::string& pk, uint64_t time, const char* data, uint3 return true; } -bool MemTable::Put(uint64_t time, const std::string& value, const Dimensions& dimensions) { +absl::Status MemTable::Put(uint64_t time, const std::string& value, const Dimensions& dimensions, bool put_if_absent) { if (dimensions.empty()) { PDLOG(WARNING, "empty dimension. tid %u pid %u", id_, pid_); - return false; - } - if (value.length() < codec::HEADER_LENGTH) { - PDLOG(WARNING, "invalid value. tid %u pid %u", id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": empty dimension")); } + // inner index pos: -1 means invalid, so it's positive in inner_index_key_map std::map inner_index_key_map; for (auto iter = dimensions.begin(); iter != dimensions.end(); iter++) { int32_t inner_pos = table_index_.GetInnerIndexPos(iter->idx()); if (inner_pos < 0) { - PDLOG(WARNING, "invalid dimension. 
dimension idx %u, tid %u pid %u", iter->idx(), id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": invalid dimension idx ", iter->idx())); } inner_index_key_map.emplace(inner_pos, iter->key()); } uint32_t real_ref_cnt = 0; const int8_t* data = reinterpret_cast(value.data()); std::string uncompress_data; + uint32_t data_length = value.length(); if (GetCompressType() == openmldb::type::kSnappy) { snappy::Uncompress(value.data(), value.size(), &uncompress_data); data = reinterpret_cast(uncompress_data.data()); + data_length = uncompress_data.length(); + } + if (data_length < codec::HEADER_LENGTH) { + PDLOG(WARNING, "invalid value. tid %u pid %u", id_, pid_); + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": invalid value")); } uint8_t version = codec::RowView::GetSchemaVersion(data); auto decoder = GetVersionDecoder(version); if (decoder == nullptr) { - PDLOG(WARNING, "invalid schema version %u, tid %u pid %u", version, id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": invalid schema version ", version)); } std::map> ts_value_map; for (const auto& kv : inner_index_key_map) { auto inner_index = table_index_.GetInnerIndex(kv.first); if (!inner_index) { - PDLOG(WARNING, "invalid inner index pos %d. tid %u pid %u", kv.first, id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": invalid inner index pos ", kv.first)); } std::map ts_map; for (const auto& index_def : inner_index->GetIndex()) { @@ -189,13 +189,12 @@ bool MemTable::Put(uint64_t time, const std::string& value, const Dimensions& di if (ts_col->IsAutoGenTs()) { ts = time; } else if (decoder->GetInteger(data, ts_col->GetId(), ts_col->GetType(), &ts) != 0) { - PDLOG(WARNING, "get ts failed. tid %u pid %u", id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": get ts failed")); } if (ts < 0) { - PDLOG(WARNING, "ts %ld is negative. tid %u pid %u", ts, id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": ts is negative ", ts)); } + // TODO(hw): why uint32_t to int32_t? 
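// ts_map: one entry per ts column of this inner index, keyed by the ts column id
// and holding the timestamp to write for it: either the caller-supplied `time`
// (auto-generated ts column) or the value just decoded from the row.
// real_ref_cnt counts how many index entries will reference the shared DataBlock.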
ts_map.emplace(ts_col->GetId(), ts); real_ref_cnt++; } @@ -205,7 +204,7 @@ bool MemTable::Put(uint64_t time, const std::string& value, const Dimensions& di } } if (ts_value_map.empty()) { - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": empty ts value map")); } auto* block = new DataBlock(real_ref_cnt, value.c_str(), value.length()); for (const auto& kv : inner_index_key_map) { @@ -218,10 +217,12 @@ bool MemTable::Put(uint64_t time, const std::string& value, const Dimensions& di seg_idx = ::openmldb::base::hash(kv.second.data(), kv.second.size(), SEED) % seg_cnt_; } Segment* segment = segments_[kv.first][seg_idx]; - segment->Put(::openmldb::base::Slice(kv.second), iter->second, block); + if (!segment->Put(kv.second, iter->second, block, put_if_absent)) { + return absl::AlreadyExistsError("data exists"); // let caller know exists + } } record_byte_size_.fetch_add(GetRecordSize(value.length())); - return true; + return absl::OkStatus(); } bool MemTable::Delete(const ::openmldb::api::LogEntry& entry) { @@ -550,6 +551,7 @@ uint64_t MemTable::GetRecordIdxCnt() { if (!index_def || !index_def->IsReady()) { return record_idx_cnt; } + uint32_t inner_idx = index_def->GetInnerPos(); auto inner_index = table_index_.GetInnerIndex(inner_idx); int32_t ts_col_id = -1; @@ -665,7 +667,7 @@ bool MemTable::AddIndex(const ::openmldb::common::ColumnKey& column_key) { } ts_vec.push_back(ts_iter->second.GetId()); } else { - ts_vec.push_back(DEFUALT_TS_COL_ID); + ts_vec.push_back(DEFAULT_TS_COL_ID); } uint32_t inner_id = table_index_.GetAllInnerIndex()->size(); Segment** seg_arr = new Segment*[seg_cnt_]; @@ -685,7 +687,7 @@ bool MemTable::AddIndex(const ::openmldb::common::ColumnKey& column_key) { auto ts_iter = schema.find(column_key.ts_name()); index_def->SetTsColumn(std::make_shared(ts_iter->second)); } else { - index_def->SetTsColumn(std::make_shared(DEFUALT_TS_COL_NAME, DEFUALT_TS_COL_ID, + index_def->SetTsColumn(std::make_shared(DEFAULT_TS_COL_NAME, DEFAULT_TS_COL_ID, ::openmldb::type::kTimestamp, true)); } if (column_key.has_ttl()) { @@ -724,14 +726,14 @@ bool MemTable::DeleteIndex(const std::string& idx_name) { new_table_meta->mutable_column_key(index_def->GetId())->set_flag(1); } std::atomic_store_explicit(&table_meta_, new_table_meta, std::memory_order_release); - index_def->SetStatus(IndexStatus::kWaiting); + index_def->SetStatus(IndexStatus::kWaiting); // let gc do deletion return true; } ::hybridse::vm::WindowIterator* MemTable::NewWindowIterator(uint32_t index) { std::shared_ptr index_def = table_index_.GetIndex(index); if (!index_def || !index_def->IsReady()) { - LOG(WARNING) << "index id " << index << " not found. tid " << id_ << " pid " << pid_; + LOG(WARNING) << "index id " << index << " not found. 
tid " << id_ << " pid " << pid_; return nullptr; } uint64_t expire_time = 0; diff --git a/src/storage/mem_table.h b/src/storage/mem_table.h index 8ae1964e0ef..c84863eb1c7 100644 --- a/src/storage/mem_table.h +++ b/src/storage/mem_table.h @@ -51,7 +51,8 @@ class MemTable : public Table { bool Put(const std::string& pk, uint64_t time, const char* data, uint32_t size) override; - bool Put(uint64_t time, const std::string& value, const Dimensions& dimensions) override; + absl::Status Put(uint64_t time, const std::string& value, const Dimensions& dimensions, + bool put_if_absent) override; bool GetBulkLoadInfo(::openmldb::api::BulkLoadInfoResponse* response); @@ -59,8 +60,6 @@ class MemTable : public Table { const ::google::protobuf::RepeatedPtrField<::openmldb::api::BulkLoadIndex>& indexes); bool Delete(const ::openmldb::api::LogEntry& entry) override; - bool Delete(uint32_t idx, const std::string& key, - const std::optional& start_ts, const std::optional& end_ts); // use the first demission TableIterator* NewIterator(const std::string& pk, Ticket& ticket) override; @@ -111,6 +110,9 @@ class MemTable : public Table { bool CheckLatest(uint32_t index_id, const std::string& key, uint64_t ts); + bool Delete(uint32_t idx, const std::string& key, + const std::optional& start_ts, const std::optional& end_ts); + private: uint32_t seg_cnt_; std::vector segments_; diff --git a/src/storage/schema.cc b/src/storage/schema.cc index 7efff1d35a4..3250a047a8b 100644 --- a/src/storage/schema.cc +++ b/src/storage/schema.cc @@ -216,7 +216,7 @@ int TableIndex::ParseFromMeta(const ::openmldb::api::TableMeta& table_meta) { index->SetTsColumn(col_map[ts_name]); } else { // set default ts col - index->SetTsColumn(std::make_shared(DEFUALT_TS_COL_NAME, DEFUALT_TS_COL_ID, + index->SetTsColumn(std::make_shared(DEFAULT_TS_COL_NAME, DEFAULT_TS_COL_ID, ::openmldb::type::kTimestamp, true)); } if (column_key.has_ttl()) { @@ -232,7 +232,7 @@ int TableIndex::ParseFromMeta(const ::openmldb::api::TableMeta& table_meta) { // add default dimension if (indexs_->empty()) { auto index = std::make_shared("idx0", 0); - index->SetTsColumn(std::make_shared(DEFUALT_TS_COL_NAME, DEFUALT_TS_COL_ID, + index->SetTsColumn(std::make_shared(DEFAULT_TS_COL_NAME, DEFAULT_TS_COL_ID, ::openmldb::type::kTimestamp, true)); if (AddIndex(index) < 0) { DLOG(WARNING) << "add index failed"; diff --git a/src/storage/schema.h b/src/storage/schema.h index 2143744122e..7be52edfcb0 100644 --- a/src/storage/schema.h +++ b/src/storage/schema.h @@ -31,10 +31,16 @@ namespace openmldb::storage { static constexpr uint32_t MAX_INDEX_NUM = 200; -static constexpr uint32_t DEFUALT_TS_COL_ID = UINT32_MAX; -static constexpr const char* DEFUALT_TS_COL_NAME = "default_ts"; - -enum TTLType { kAbsoluteTime = 1, kRelativeTime = 2, kLatestTime = 3, kAbsAndLat = 4, kAbsOrLat = 5 }; +static constexpr uint32_t DEFAULT_TS_COL_ID = UINT32_MAX; +static constexpr const char* DEFAULT_TS_COL_NAME = "___default_ts___"; + +enum TTLType { + kAbsoluteTime = 1, + kRelativeTime = 2, + kLatestTime = 3, + kAbsAndLat = 4, + kAbsOrLat = 5 +}; // ttl unit: millisecond struct TTLSt { @@ -163,7 +169,7 @@ class ColumnDef { return false; } - inline bool IsAutoGenTs() const { return id_ == DEFUALT_TS_COL_ID; } + inline bool IsAutoGenTs() const { return id_ == DEFAULT_TS_COL_ID; } private: std::string name_; diff --git a/src/storage/schema_test.cc b/src/storage/schema_test.cc index 1c169697634..77840c13e93 100644 --- a/src/storage/schema_test.cc +++ b/src/storage/schema_test.cc @@ -233,9 +233,9 @@ 
TEST_F(SchemaTest, TsAndDefaultTs) { ::openmldb::storage::kAbsoluteTime); AssertIndex(*(table_index.GetIndex("key2")), "key2", "col1", "col7", 7, 10, 0, ::openmldb::storage::kAbsoluteTime); AssertIndex(*(table_index.GetIndex("key3")), "key3", "col2", "col6", 6, 10, 0, ::openmldb::storage::kAbsoluteTime); - AssertIndex(*(table_index.GetIndex("key4")), "key4", "col2", DEFUALT_TS_COL_NAME, DEFUALT_TS_COL_ID, + AssertIndex(*(table_index.GetIndex("key4")), "key4", "col2", DEFAULT_TS_COL_NAME, DEFAULT_TS_COL_ID, 10, 0, ::openmldb::storage::kAbsoluteTime); - AssertIndex(*(table_index.GetIndex("key5")), "key5", "col3", DEFUALT_TS_COL_NAME, DEFUALT_TS_COL_ID, + AssertIndex(*(table_index.GetIndex("key5")), "key5", "col3", DEFAULT_TS_COL_NAME, DEFAULT_TS_COL_ID, 10, 0, ::openmldb::storage::kAbsoluteTime); auto inner_index = table_index.GetAllInnerIndex(); ASSERT_EQ(inner_index->size(), 3u); @@ -243,10 +243,10 @@ TEST_F(SchemaTest, TsAndDefaultTs) { std::vector ts_vec0 = {6, 7}; AssertInnerIndex(*(table_index.GetInnerIndex(0)), 0, index0, ts_vec0); std::vector index1 = {"key3", "key4"}; - std::vector ts_vec1 = {6, DEFUALT_TS_COL_ID}; + std::vector ts_vec1 = {6, DEFAULT_TS_COL_ID}; AssertInnerIndex(*(table_index.GetInnerIndex(1)), 1, index1, ts_vec1); std::vector index2 = {"key5"}; - std::vector ts_vec2 = {DEFUALT_TS_COL_ID}; + std::vector ts_vec2 = {DEFAULT_TS_COL_ID}; AssertInnerIndex(*(table_index.GetInnerIndex(2)), 2, index2, ts_vec2); } diff --git a/src/storage/segment.cc b/src/storage/segment.cc index d79b6e85681..1734dc022c5 100644 --- a/src/storage/segment.cc +++ b/src/storage/segment.cc @@ -15,7 +15,9 @@ */ #include "storage/segment.h" + #include + #include #include "base/glog_wrapper.h" @@ -64,9 +66,7 @@ Segment::Segment(uint8_t height, const std::vector& ts_idx_vec) } } -Segment::~Segment() { - delete entries_; -} +Segment::~Segment() { delete entries_; } void Segment::Release(StatisticsInfo* statistics_info) { std::unique_ptr it(entries_->NewIterator()); @@ -98,9 +98,7 @@ void Segment::Release(StatisticsInfo* statistics_info) { } } -void Segment::ReleaseAndCount(StatisticsInfo* statistics_info) { - Release(statistics_info); -} +void Segment::ReleaseAndCount(StatisticsInfo* statistics_info) { Release(statistics_info); } void Segment::ReleaseAndCount(const std::vector& id_vec, StatisticsInfo* statistics_info) { if (ts_cnt_ <= 1) { @@ -135,25 +133,28 @@ void Segment::ReleaseAndCount(const std::vector& id_vec, StatisticsInfo* } } -void Segment::Put(const Slice& key, uint64_t time, const char* data, uint32_t size) { +void Segment::Put(const Slice& key, uint64_t time, const char* data, uint32_t size, bool put_if_absent, + bool check_all_time) { if (ts_cnt_ > 1) { return; } auto* db = new DataBlock(1, data, size); - Put(key, time, db); + Put(key, time, db, put_if_absent, check_all_time); } -void Segment::Put(const Slice& key, uint64_t time, DataBlock* row) { +bool Segment::Put(const Slice& key, uint64_t time, DataBlock* row, bool put_if_absent, bool check_all_time) { if (ts_cnt_ > 1) { - return; + LOG(ERROR) << "wrong call"; + return false; } std::lock_guard lock(mu_); - PutUnlock(key, time, row); + return PutUnlock(key, time, row, put_if_absent, check_all_time); } -void Segment::PutUnlock(const Slice& key, uint64_t time, DataBlock* row) { +bool Segment::PutUnlock(const Slice& key, uint64_t time, DataBlock* row, bool put_if_absent, bool check_all_time) { void* entry = nullptr; uint32_t byte_size = 0; + // one key just one entry int ret = entries_->Get(key, entry); if (ret < 0 || entry == nullptr) 
{ char* pk = new char[key.size()]; @@ -164,12 +165,17 @@ void Segment::PutUnlock(const Slice& key, uint64_t time, DataBlock* row) { uint8_t height = entries_->Insert(skey, entry); byte_size += GetRecordPkIdxSize(height, key.size(), key_entry_max_height_); pk_cnt_.fetch_add(1, std::memory_order_relaxed); + // no need to check if absent when first put + } else if (put_if_absent && ListContains(reinterpret_cast(entry), time, row, check_all_time)) { + return false; } + idx_cnt_vec_[0]->fetch_add(1, std::memory_order_relaxed); uint8_t height = reinterpret_cast(entry)->entries.Insert(time, row); reinterpret_cast(entry)->count_.fetch_add(1, std::memory_order_relaxed); byte_size += GetRecordTsIdxSize(height); idx_byte_size_.fetch_add(byte_size, std::memory_order_relaxed); + return true; } void Segment::BulkLoadPut(unsigned int key_entry_id, const Slice& key, uint64_t time, DataBlock* row) { @@ -201,16 +207,17 @@ void Segment::BulkLoadPut(unsigned int key_entry_id, const Slice& key, uint64_t } } -void Segment::Put(const Slice& key, const std::map& ts_map, DataBlock* row) { - uint32_t ts_size = ts_map.size(); - if (ts_size == 0) { - return; +bool Segment::Put(const Slice& key, const std::map& ts_map, DataBlock* row, bool put_if_absent) { + if (ts_map.empty()) { + return false; } if (ts_cnt_ == 1) { + bool ret = false; if (auto pos = ts_map.find(ts_idx_map_.begin()->first); pos != ts_map.end()) { - Put(key, pos->second, row); + // TODO(hw): why ts_map key is int32_t, default ts is uint32_t? + ret = Put(key, pos->second, row, put_if_absent, pos->first == DEFAULT_TS_COL_ID); } - return; + return ret; } void* entry_arr = nullptr; std::lock_guard lock(mu_); @@ -237,19 +244,24 @@ void Segment::Put(const Slice& key, const std::map& ts_map, D } } auto entry = reinterpret_cast(entry_arr)[pos->second]; + if (put_if_absent && ListContains(entry, kv.second, row, pos->first == DEFAULT_TS_COL_ID)) { + return false; + } uint8_t height = entry->entries.Insert(kv.second, row); entry->count_.fetch_add(1, std::memory_order_relaxed); byte_size += GetRecordTsIdxSize(height); idx_byte_size_.fetch_add(byte_size, std::memory_order_relaxed); idx_cnt_vec_[pos->second]->fetch_add(1, std::memory_order_relaxed); } + return true; } bool Segment::Delete(const std::optional& idx, const Slice& key) { + uint32_t ts_idx = 0; + if (!GetTsIdx(idx, &ts_idx)) { + return false; + } if (ts_cnt_ == 1) { - if (idx.has_value() && ts_idx_map_.find(idx.value()) == ts_idx_map_.end()) { - return false; - } ::openmldb::base::Node* entry_node = nullptr; { std::lock_guard lock(mu_); @@ -260,13 +272,6 @@ bool Segment::Delete(const std::optional& idx, const Slice& key) { return true; } } else { - if (!idx.has_value()) { - return false; - } - auto iter = ts_idx_map_.find(idx.value()); - if (iter == ts_idx_map_.end()) { - return false; - } base::Node* data_node = nullptr; { std::lock_guard lock(mu_); @@ -274,7 +279,7 @@ bool Segment::Delete(const std::optional& idx, const Slice& key) { if (entries_->Get(key, entry_arr) < 0 || entry_arr == nullptr) { return true; } - KeyEntry* key_entry = reinterpret_cast(entry_arr)[iter->second]; + KeyEntry* key_entry = reinterpret_cast(entry_arr)[ts_idx]; std::unique_ptr it(key_entry->entries.NewIterator()); it->SeekToFirst(); if (it->Valid()) { @@ -283,25 +288,18 @@ bool Segment::Delete(const std::optional& idx, const Slice& key) { } } if (data_node != nullptr) { - node_cache_.AddValueNodeList(iter->second, gc_version_.load(std::memory_order_relaxed), data_node); + node_cache_.AddValueNodeList(ts_idx, 
gc_version_.load(std::memory_order_relaxed), data_node); } } return true; } -bool Segment::Delete(const std::optional& idx, const Slice& key, - uint64_t ts, const std::optional& end_ts) { - void* entry = nullptr; - if (entries_->Get(key, entry) < 0 || entry == nullptr) { - return true; - } - KeyEntry* key_entry = nullptr; - uint32_t ts_idx = 0; +bool Segment::GetTsIdx(const std::optional& idx, uint32_t* ts_idx) { + *ts_idx = 0; if (ts_cnt_ == 1) { if (idx.has_value() && ts_idx_map_.find(idx.value()) == ts_idx_map_.end()) { return false; } - key_entry = reinterpret_cast(entry); } else { if (!idx.has_value()) { return false; @@ -310,8 +308,55 @@ bool Segment::Delete(const std::optional& idx, const Slice& key, if (iter == ts_idx_map_.end()) { return false; } - key_entry = reinterpret_cast(entry)[iter->second]; - ts_idx = iter->second; + *ts_idx = iter->second; + } + return true; +} + +bool Segment::Delete(const std::optional& idx, const Slice& key, uint64_t ts) { + uint32_t ts_idx = 0; + if (!GetTsIdx(idx, &ts_idx)) { + return false; + } + void* entry = nullptr; + if (entries_->Get(key, entry) < 0 || entry == nullptr) { + return true; + } + KeyEntry* key_entry = nullptr; + if (ts_cnt_ == 1) { + key_entry = reinterpret_cast(entry); + } else { + key_entry = reinterpret_cast(entry)[ts_idx]; + } + base::Node* data_node = nullptr; + { + std::lock_guard lock(mu_); + data_node = key_entry->entries.Remove(ts); + } + if (data_node) { + node_cache_.AddSingleValueNode(ts_idx, gc_version_.load(std::memory_order_relaxed), data_node); + } + return true; +} + +bool Segment::Delete(const std::optional& idx, const Slice& key, + uint64_t ts, const std::optional& end_ts) { + if (end_ts.has_value() && end_ts.value() + 1 == ts) { + return Delete(idx, key, ts); + } + uint32_t ts_idx = 0; + if (!GetTsIdx(idx, &ts_idx)) { + return false; + } + void* entry = nullptr; + if (entries_->Get(key, entry) < 0 || entry == nullptr) { + return true; + } + KeyEntry* key_entry = nullptr; + if (ts_cnt_ == 1) { + key_entry = reinterpret_cast(entry); + } else { + key_entry = reinterpret_cast(entry)[ts_idx]; } if (end_ts.has_value()) { if (auto node = key_entry->entries.GetLast(); node == nullptr) { @@ -347,7 +392,7 @@ bool Segment::Delete(const std::optional& idx, const Slice& key, } void Segment::FreeList(uint32_t ts_idx, ::openmldb::base::Node* node, - StatisticsInfo* statistics_info) { + StatisticsInfo* statistics_info) { while (node != nullptr) { statistics_info->IncrIdxCnt(ts_idx); ::openmldb::base::Node* tmp = node; @@ -365,7 +410,6 @@ void Segment::FreeList(uint32_t ts_idx, ::openmldb::base::Node it(entry->entries.NewIterator()); + if (check_all_time) { + it->SeekToFirst(); + while (it->Valid()) { + if (it->GetValue()->EqualWithoutCnt(*row)) { + return true; + } + it->Next(); + } + } else { + // less than but desc time comparator, so it's <= time(not valid if empty or all > time), and get smaller by + // next + it->Seek(time); + while (it->Valid()) { + // key > time is just a protection, normally it should not happen + if (it->GetKey() < time || it->GetKey() > time) { + break; // no entry == time, or all entries == time have been checked + } + if (it->GetValue()->EqualWithoutCnt(*row)) { + return true; + } + it->Next(); + } + } + return false; +} + // fast gc with no global pause void Segment::Gc4TTL(const uint64_t time, StatisticsInfo* statistics_info) { uint64_t consumed = ::baidu::common::timer::get_micros(); @@ -606,8 +679,7 @@ void Segment::Gc4TTL(const uint64_t time, StatisticsInfo* statistics_info) { if (node == 
nullptr) { continue; } else if (node->GetKey() > time) { - DEBUGLOG("[Gc4TTL] segment gc with key %lu need not ttl, last node key %lu", - time, node->GetKey()); + DEBUGLOG("[Gc4TTL] segment gc with key %lu need not ttl, last node key %lu", time, node->GetKey()); continue; } node = nullptr; @@ -648,8 +720,7 @@ void Segment::Gc4TTLAndHead(const uint64_t time, const uint64_t keep_cnt, Statis if (node == nullptr) { continue; } else if (node->GetKey() > time) { - DEBUGLOG("[Gc4TTLAndHead] segment gc with key %lu need not ttl, last node key %lu", - time, node->GetKey()); + DEBUGLOG("[Gc4TTLAndHead] segment gc with key %lu need not ttl, last node key %lu", time, node->GetKey()); continue; } node = nullptr; @@ -663,8 +734,8 @@ void Segment::Gc4TTLAndHead(const uint64_t time, const uint64_t keep_cnt, Statis FreeList(0, node, statistics_info); entry->count_.fetch_sub(statistics_info->GetIdxCnt(0) - cur_idx_cnt, std::memory_order_relaxed); } - DEBUGLOG("[Gc4TTLAndHead] segment gc time %lu and keep cnt %lu consumed %lu, count %lu", - time, keep_cnt, (::baidu::common::timer::get_micros() - consumed) / 1000, statistics_info->GetIdxCnt(0) - old); + DEBUGLOG("[Gc4TTLAndHead] segment gc time %lu and keep cnt %lu consumed %lu, count %lu", time, keep_cnt, + (::baidu::common::timer::get_micros() - consumed) / 1000, statistics_info->GetIdxCnt(0) - old); idx_cnt_vec_[0]->fetch_sub(statistics_info->GetIdxCnt(0) - old, std::memory_order_relaxed); } @@ -709,8 +780,8 @@ void Segment::Gc4TTLOrHead(const uint64_t time, const uint64_t keep_cnt, Statist FreeList(0, node, statistics_info); entry->count_.fetch_sub(statistics_info->GetIdxCnt(0) - cur_idx_cnt, std::memory_order_relaxed); } - DEBUGLOG("[Gc4TTLAndHead] segment gc time %lu and keep cnt %lu consumed %lu, count %lu", - time, keep_cnt, (::baidu::common::timer::get_micros() - consumed) / 1000, statistics_info->GetIdxCnt(0) - old); + DEBUGLOG("[Gc4TTLAndHead] segment gc time %lu and keep cnt %lu consumed %lu, count %lu", time, keep_cnt, + (::baidu::common::timer::get_micros() - consumed) / 1000, statistics_info->GetIdxCnt(0) - old); idx_cnt_vec_[0]->fetch_sub(statistics_info->GetIdxCnt(0) - old, std::memory_order_relaxed); } @@ -754,8 +825,8 @@ MemTableIterator* Segment::NewIterator(const Slice& key, Ticket& ticket, type::C return new MemTableIterator(reinterpret_cast(entry)->entries.NewIterator(), compress_type); } -MemTableIterator* Segment::NewIterator(const Slice& key, uint32_t idx, - Ticket& ticket, type::CompressType compress_type) { +MemTableIterator* Segment::NewIterator(const Slice& key, uint32_t idx, Ticket& ticket, + type::CompressType compress_type) { auto pos = ts_idx_map_.find(idx); if (pos == ts_idx_map_.end()) { return new MemTableIterator(nullptr, compress_type); diff --git a/src/storage/segment.h b/src/storage/segment.h index fe58dd893a0..42f5ad6946c 100644 --- a/src/storage/segment.h +++ b/src/storage/segment.h @@ -70,18 +70,18 @@ class Segment { Segment(uint8_t height, const std::vector& ts_idx_vec); ~Segment(); - // Put time data - void Put(const Slice& key, uint64_t time, const char* data, uint32_t size); + // legacy interface called by memtable and ut + void Put(const Slice& key, uint64_t time, const char* data, uint32_t size, bool put_if_absent = false, + bool check_all_time = false); - void Put(const Slice& key, uint64_t time, DataBlock* row); - - void PutUnlock(const Slice& key, uint64_t time, DataBlock* row); + bool Put(const Slice& key, uint64_t time, DataBlock* row, bool put_if_absent = false, bool check_all_time = false); void 
BulkLoadPut(unsigned int key_entry_id, const Slice& key, uint64_t time, DataBlock* row); - - void Put(const Slice& key, const std::map& ts_map, DataBlock* row); + // main put method + bool Put(const Slice& key, const std::map& ts_map, DataBlock* row, bool put_if_absent = false); bool Delete(const std::optional& idx, const Slice& key); + bool Delete(const std::optional& idx, const Slice& key, uint64_t ts); bool Delete(const std::optional& idx, const Slice& key, uint64_t ts, const std::optional& end_ts); @@ -97,12 +97,10 @@ class Segment { void GcAllType(const std::map& ttl_st_map, StatisticsInfo* statistics_info); MemTableIterator* NewIterator(const Slice& key, Ticket& ticket, type::CompressType compress_type); // NOLINT - MemTableIterator* NewIterator(const Slice& key, uint32_t idx, - Ticket& ticket, type::CompressType compress_type); // NOLINT + MemTableIterator* NewIterator(const Slice& key, uint32_t idx, Ticket& ticket, // NOLINT + type::CompressType compress_type); - uint64_t GetIdxCnt() const { - return idx_cnt_vec_[0]->load(std::memory_order_relaxed); - } + uint64_t GetIdxCnt() const { return idx_cnt_vec_[0]->load(std::memory_order_relaxed); } int GetIdxCnt(uint32_t ts_idx, uint64_t& ts_cnt) { // NOLINT uint32_t real_idx = 0; @@ -145,9 +143,14 @@ class Segment { void ReleaseAndCount(const std::vector& id_vec, StatisticsInfo* statistics_info); private: - void FreeList(uint32_t ts_idx, ::openmldb::base::Node* node, - StatisticsInfo* statistics_info); + void FreeList(uint32_t ts_idx, ::openmldb::base::Node* node, StatisticsInfo* statistics_info); void SplitList(KeyEntry* entry, uint64_t ts, ::openmldb::base::Node** node); + bool GetTsIdx(const std::optional& idx, uint32_t* ts_idx); + + bool ListContains(KeyEntry* entry, uint64_t time, DataBlock* row, bool check_all_time); + + bool PutUnlock(const Slice& key, uint64_t time, DataBlock* row, bool put_if_absent = false, + bool check_all_time = false); private: KeyEntries* entries_; diff --git a/src/storage/segment_test.cc b/src/storage/segment_test.cc index c51c0984473..e43461c47e6 100644 --- a/src/storage/segment_test.cc +++ b/src/storage/segment_test.cc @@ -424,6 +424,82 @@ TEST_F(SegmentTest, TestDeleteRange) { CheckStatisticsInfo(CreateStatisticsInfo(20, 1012, 20 * (6 + sizeof(DataBlock))), gc_info); } +TEST_F(SegmentTest, PutIfAbsent) { + { + Segment segment(8); // so ts_cnt_ == 1 + // check all time == false + segment.Put("PK", 1, "test1", 5, true); + segment.Put("PK", 1, "test2", 5, true); // even key&time is the same, different value means different record + ASSERT_EQ(2, (int64_t)segment.GetIdxCnt()); + ASSERT_EQ(1, (int64_t)segment.GetPkCnt()); + segment.Put("PK", 2, "test3", 5, true); + segment.Put("PK", 2, "test4", 5, true); + segment.Put("PK", 3, "test5", 5, true); + segment.Put("PK", 3, "test6", 5, true); + ASSERT_EQ(6, (int64_t)segment.GetIdxCnt()); + // insert exists rows + segment.Put("PK", 2, "test3", 5, true); + segment.Put("PK", 1, "test1", 5, true); + segment.Put("PK", 1, "test2", 5, true); + segment.Put("PK", 3, "test6", 5, true); + ASSERT_EQ(6, (int64_t)segment.GetIdxCnt()); + // new rows + segment.Put("PK", 2, "test7", 5, true); + ASSERT_EQ(7, (int64_t)segment.GetIdxCnt()); + segment.Put("PK", 0, "test8", 5, true); // seek to last, next is empty + ASSERT_EQ(8, (int64_t)segment.GetIdxCnt()); + } + + { + // support when ts_cnt_ != 1 too + std::vector ts_idx_vec = {1, 3}; + Segment segment(8, ts_idx_vec); + ASSERT_EQ(2, (int64_t)segment.GetTsCnt()); + std::string key = "PK"; + uint64_t ts = 1669013677221000; + // the 
same ts + for (int j = 0; j < 2; j++) { + DataBlock* data = new DataBlock(2, key.c_str(), key.length()); + std::map ts_map = {{1, ts}, {3, ts}}; + segment.Put(Slice(key), ts_map, data, true); + } + ASSERT_EQ(1, GetCount(&segment, 1)); + ASSERT_EQ(1, GetCount(&segment, 3)); + } + + { + // put ts_map contains DEFAULT_TS_COL_ID + std::vector ts_idx_vec = {DEFAULT_TS_COL_ID}; + Segment segment(8, ts_idx_vec); + ASSERT_EQ(1, (int64_t)segment.GetTsCnt()); + std::string key = "PK"; + std::map ts_map = {{DEFAULT_TS_COL_ID, 100}}; // cur time == 100 + auto* block = new DataBlock(1, "test1", 5); + segment.Put(Slice(key), ts_map, block, true); + ASSERT_EQ(1, GetCount(&segment, DEFAULT_TS_COL_ID)); + ts_map = {{DEFAULT_TS_COL_ID, 200}}; + block = new DataBlock(1, "test1", 5); + segment.Put(Slice(key), ts_map, block, true); + ASSERT_EQ(1, GetCount(&segment, DEFAULT_TS_COL_ID)); + } + + { + // put ts_map contains DEFAULT_TS_COL_ID + std::vector ts_idx_vec = {DEFAULT_TS_COL_ID, 1, 3}; + Segment segment(8, ts_idx_vec); + ASSERT_EQ(3, (int64_t)segment.GetTsCnt()); + std::string key = "PK"; + std::map ts_map = {{DEFAULT_TS_COL_ID, 100}}; // cur time == 100 + auto* block = new DataBlock(1, "test1", 5); + segment.Put(Slice(key), ts_map, block, true); + ASSERT_EQ(1, GetCount(&segment, DEFAULT_TS_COL_ID)); + ts_map = {{DEFAULT_TS_COL_ID, 200}}; + block = new DataBlock(1, "test1", 5); + segment.Put(Slice(key), ts_map, block, true); + ASSERT_EQ(1, GetCount(&segment, DEFAULT_TS_COL_ID)); + } +} + } // namespace storage } // namespace openmldb diff --git a/src/storage/snapshot_test.cc b/src/storage/snapshot_test.cc index 910a8bc7724..e9dd679eafc 100644 --- a/src/storage/snapshot_test.cc +++ b/src/storage/snapshot_test.cc @@ -1085,7 +1085,7 @@ TEST_F(SnapshotTest, MakeSnapshotAbsOrLat) { SchemaCodec::SetColumnDesc(table_meta->add_column_desc(), "value", ::openmldb::type::kString); SchemaCodec::SetIndex(table_meta->add_column_key(), "index1", "card|merchant", "", ::openmldb::type::kAbsOrLat, 0, 1); - std::shared_ptr table = std::make_shared(*table_meta); + std::shared_ptr
table = std::make_shared(*table_meta); table->Init(); LogParts* log_part = new LogParts(12, 4, scmp); @@ -1119,7 +1119,7 @@ TEST_F(SnapshotTest, MakeSnapshotAbsOrLat) { google::protobuf::RepeatedPtrField<::openmldb::api::Dimension> d_list; ::openmldb::api::Dimension* d_ptr2 = d_list.Add(); d_ptr2->CopyFrom(dimensions); - ASSERT_EQ(table->Put(i + 1, *result, d_list), true); + ASSERT_EQ(table->Put(i + 1, *result, d_list).ok(), true); } table->SchedGc(); diff --git a/src/storage/table.h b/src/storage/table.h index 0766e4cf6c4..4c4a1f011f7 100644 --- a/src/storage/table.h +++ b/src/storage/table.h @@ -22,6 +22,7 @@ #include #include +#include "absl/status/status.h" #include "codec/codec.h" #include "proto/tablet.pb.h" #include "storage/iterator.h" @@ -50,17 +51,16 @@ class Table { int InitColumnDesc(); virtual bool Put(const std::string& pk, uint64_t time, const char* data, uint32_t size) = 0; + // DO NOT set different default value in derived class + virtual absl::Status Put(uint64_t time, const std::string& value, const Dimensions& dimensions, + bool put_if_absent = false) = 0; - virtual bool Put(uint64_t time, const std::string& value, const Dimensions& dimensions) = 0; - - bool Put(const ::openmldb::api::LogEntry& entry) { - return Put(entry.ts(), entry.value(), entry.dimensions()); - } + bool Put(const ::openmldb::api::LogEntry& entry) { return Put(entry.ts(), entry.value(), entry.dimensions()).ok(); } virtual bool Delete(const ::openmldb::api::LogEntry& entry) = 0; - virtual bool Delete(uint32_t idx, const std::string& key, - const std::optional& start_ts, const std::optional& end_ts) = 0; + virtual bool Delete(uint32_t idx, const std::string& key, const std::optional& start_ts, + const std::optional& end_ts) = 0; virtual TableIterator* NewIterator(const std::string& pk, Ticket& ticket) = 0; // NOLINT @@ -88,9 +88,7 @@ class Table { } return ""; } - inline ::openmldb::common::StorageMode GetStorageMode() const { - return storage_mode_; - } + inline ::openmldb::common::StorageMode GetStorageMode() const { return storage_mode_; } inline uint32_t GetId() const { return id_; } inline uint32_t GetIdxCnt() const { return table_index_.Size(); } @@ -173,7 +171,7 @@ class Table { virtual uint64_t GetRecordByteSize() const = 0; virtual uint64_t GetRecordIdxByteSize() = 0; - virtual int GetCount(uint32_t index, const std::string& pk, uint64_t& count) = 0; // NOLINT + virtual int GetCount(uint32_t index, const std::string& pk, uint64_t& count) = 0; // NOLINT protected: void UpdateTTL(); diff --git a/src/storage/table_iterator_test.cc b/src/storage/table_iterator_test.cc index 7ba932422e1..3af20940266 100644 --- a/src/storage/table_iterator_test.cc +++ b/src/storage/table_iterator_test.cc @@ -450,7 +450,7 @@ TEST_P(TableIteratorTest, SeekNonExistent) { ASSERT_EQ(0, now - wit->GetKey()); } -INSTANTIATE_TEST_CASE_P(TestMemAndHDD, TableIteratorTest, +INSTANTIATE_TEST_SUITE_P(TestMemAndHDD, TableIteratorTest, ::testing::Values(::openmldb::common::kMemory, ::openmldb::common::kHDD)); } // namespace storage diff --git a/src/storage/table_test.cc b/src/storage/table_test.cc index 251e92986c6..43b3508822e 100644 --- a/src/storage/table_test.cc +++ b/src/storage/table_test.cc @@ -198,7 +198,7 @@ TEST_P(TableTest, MultiDimissionPut0) { ::openmldb::codec::SDKCodec sdk_codec(meta); std::string result; sdk_codec.EncodeRow({"d0", "d1", "d2"}, &result); - bool ok = table->Put(1, result, dimensions); + bool ok = table->Put(1, result, dimensions).ok(); ASSERT_TRUE(ok); // some functions in disk table need to be 
implemented. // refer to issue #1238 @@ -808,7 +808,7 @@ TEST_P(TableTest, TableIteratorTS) { dim->set_key(row[1]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } TableIterator* it = table->NewTraverseIterator(0); it->SeekToFirst(); @@ -921,7 +921,7 @@ TEST_P(TableTest, TraverseIteratorCount) { dim->set_key(row[1]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } TableIterator* it = table->NewTraverseIterator(0); it->SeekToFirst(); @@ -1048,7 +1048,7 @@ TEST_P(TableTest, AbsAndLatSetGet) { dim->set_key("mcc"); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } // test get and set ttl ASSERT_EQ(10, (int64_t)table->GetIndex(0)->GetTTL()->abs_ttl / (10 * 6000)); @@ -1149,7 +1149,7 @@ TEST_P(TableTest, AbsOrLatSetGet) { dim->set_key("mcc"); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } // test get and set ttl ASSERT_EQ(10, (int64_t)table->GetIndex(0)->GetTTL()->abs_ttl / (10 * 6000)); @@ -1562,7 +1562,7 @@ TEST_P(TableTest, TraverseIteratorCountWithLimit) { dim->set_key(row[1]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } TableIterator* it = table->NewTraverseIterator(0); @@ -1669,7 +1669,7 @@ TEST_P(TableTest, TSColIDLength) { dim1->set_key(row[0]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } TableIterator* it = table->NewTraverseIterator(0); @@ -1727,7 +1727,7 @@ TEST_P(TableTest, MultiDimensionPutTS) { dim->set_key(row[1]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } TableIterator* it = table->NewTraverseIterator(0); it->SeekToFirst(); @@ -1781,7 +1781,7 @@ TEST_P(TableTest, MultiDimensionPutTS1) { dim->set_key(row[1]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } TableIterator* it = table->NewTraverseIterator(0); it->SeekToFirst(); @@ -1823,7 +1823,7 @@ TEST_P(TableTest, MultiDimissionPutTS2) { ::openmldb::codec::SDKCodec sdk_codec(meta); std::string result; sdk_codec.EncodeRow({"d0", "d1", "d2"}, &result); - bool ok = table->Put(100, result, dimensions); + bool ok = table->Put(100, result, dimensions).ok(); ASSERT_TRUE(ok); TableIterator* it = table->NewTraverseIterator(0); @@ -1885,7 +1885,7 @@ TEST_P(TableTest, AbsAndLat) { } std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } for (int i = 0; i <= 5; i++) { @@ -1938,10 +1938,11 @@ TEST_P(TableTest, NegativeTs) { dim->set_key(row[0]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - ASSERT_FALSE(table->Put(0, value, request.dimensions())); + auto st = table->Put(0, value, 
request.dimensions()); + ASSERT_TRUE(absl::IsInvalidArgument(st)) << st.ToString(); } -INSTANTIATE_TEST_CASE_P(TestMemAndHDD, TableTest, +INSTANTIATE_TEST_SUITE_P(TestMemAndHDD, TableTest, ::testing::Values(::openmldb::common::kMemory, ::openmldb::common::kHDD)); } // namespace storage diff --git a/src/tablet/tablet_impl.cc b/src/tablet/tablet_impl.cc index 8691cf2f90b..42e68db04c2 100644 --- a/src/tablet/tablet_impl.cc +++ b/src/tablet/tablet_impl.cc @@ -102,6 +102,7 @@ DECLARE_bool(use_name); DECLARE_bool(enable_distsql); DECLARE_string(snapshot_compression); DECLARE_string(file_compression); +DECLARE_int32(request_timeout_ms); // cluster config DECLARE_string(endpoint); @@ -626,17 +627,9 @@ void TabletImpl::Get(RpcController* controller, const ::openmldb::api::GetReques } else { pid = request->pid(); } - std::shared_ptr
table = GetTable(tid, pid); - if (!table) { - PDLOG(WARNING, "table does not exist. tid %u, pid %u", tid, pid); - response->set_code(::openmldb::base::ReturnCode::kTableIsNotExist); - response->set_msg("table does not exist"); - return; - } - if (table->GetTableStat() == ::openmldb::storage::kLoading) { - PDLOG(WARNING, "table is loading. tid %u, pid %u", tid, pid); - response->set_code(::openmldb::base::ReturnCode::kTableIsLoading); - response->set_msg("table is loading"); + auto table = GetTable(tid, pid); + if (auto status = CheckTable(tid, pid, false, table); !status.OK()) { + SetResponseStatus(status, response); return; } std::string index_name; @@ -714,43 +707,21 @@ void TabletImpl::Put(RpcController* controller, const ::openmldb::api::PutReques } uint32_t tid = request->tid(); uint32_t pid = request->pid(); - uint64_t start_time = ::baidu::common::timer::get_micros(); - std::shared_ptr
table = GetTable(tid, pid); - if (!table) { - PDLOG(WARNING, "table does not exist. tid %u, pid %u", tid, pid); - response->set_code(::openmldb::base::ReturnCode::kTableIsNotExist); - response->set_msg("table does not exist"); + auto table = GetTable(tid, pid); + if (auto status = CheckTable(tid, pid, true, table); !status.OK()) { + SetResponseStatus(status, response); return; } + uint64_t start_time = ::baidu::common::timer::get_micros(); DLOG(INFO) << "request dimension size " << request->dimensions_size() << " request time " << request->time(); - if (!table->IsLeader()) { - response->set_code(::openmldb::base::ReturnCode::kTableIsFollower); - response->set_msg("table is follower"); - return; - } - if (table->GetTableStat() == ::openmldb::storage::kLoading) { - PDLOG(WARNING, "table is loading. tid %u, pid %u", tid, pid); - response->set_code(::openmldb::base::ReturnCode::kTableIsLoading); - response->set_msg("table is loading"); + if (table->GetStorageMode() == ::openmldb::common::StorageMode::kMemory && + memory_used_.load(std::memory_order_relaxed) > FLAGS_max_memory_mb) { + PDLOG(WARNING, "current memory %lu MB exceed max memory limit %lu MB. tid %u, pid %u", + memory_used_.load(std::memory_order_relaxed), FLAGS_max_memory_mb, tid, pid); + response->set_code(::openmldb::base::ReturnCode::kExceedMaxMemory); + response->set_msg("exceed max memory"); return; } - if (table->GetStorageMode() == ::openmldb::common::StorageMode::kMemory) { - if (memory_used_.load(std::memory_order_relaxed) > FLAGS_max_memory_mb) { - PDLOG(WARNING, "current memory %lu MB exceed max memory limit %lu MB. tid %u, pid %u", - memory_used_.load(std::memory_order_relaxed), FLAGS_max_memory_mb, tid, pid); - response->set_code(base::ReturnCode::kExceedMaxMemory); - response->set_msg("exceed max memory"); - return; - } - if (request->has_memory_limit() && request->memory_limit() > 0 - && system_memory_usage_rate_.load(std::memory_order_relaxed) > request->memory_limit()) { - PDLOG(WARNING, "current system_memory_usage_rate %u exceed request memory limit %u. tid %u, pid %u", - system_memory_usage_rate_.load(std::memory_order_relaxed), request->memory_limit(), tid, pid); - response->set_code(base::ReturnCode::kExceedPutMemoryLimit); - response->set_msg("exceed memory limit"); - return; - } - } ::openmldb::api::LogEntry entry; entry.set_pk(request->pk()); entry.set_ts(request->time()); @@ -767,7 +738,8 @@ void TabletImpl::Put(RpcController* controller, const ::openmldb::api::PutReques if (request->ts_dimensions_size() > 0) { entry.mutable_ts_dimensions()->CopyFrom(request->ts_dimensions()); } - bool ok = false; + + absl::Status st; if (request->dimensions_size() > 0) { int32_t ret_code = CheckDimessionPut(request, table->GetIdxCnt()); if (ret_code != 0) { @@ -776,16 +748,27 @@ void TabletImpl::Put(RpcController* controller, const ::openmldb::api::PutReques return; } DLOG(INFO) << "put data to tid " << tid << " pid " << pid << " with key " << request->dimensions(0).key(); - ok = table->Put(entry.ts(), entry.value(), entry.dimensions()); + // 1. normal put: ok, invalid data + // 2. 
put if absent: ok, exists but ignore, invalid data + st = table->Put(entry.ts(), entry.value(), entry.dimensions(), request->put_if_absent()); } - if (!ok) { + + if (!st.ok()) { + if (request->put_if_absent() && absl::IsAlreadyExists(st)) { + // not a failure but shouldn't write log entry + response->set_code(::openmldb::base::ReturnCode::kOk); + response->set_msg("exists but ignore"); + return; + } + LOG(WARNING) << st.ToString(); response->set_code(::openmldb::base::ReturnCode::kPutFailed); - response->set_msg("put failed"); + response->set_msg(st.ToString()); return; } response->set_code(::openmldb::base::ReturnCode::kOk); std::shared_ptr replicator; + bool ok = false; do { replicator = GetReplicator(request->tid(), request->pid()); if (!replicator) { @@ -1147,17 +1130,9 @@ void TabletImpl::Scan(RpcController* controller, const ::openmldb::api::ScanRequ } else { pid = request->pid(); } - std::shared_ptr
table = GetTable(tid, pid); - if (!table) { - PDLOG(WARNING, "table does not exist. tid %u, pid %u", tid, pid); - response->set_code(::openmldb::base::ReturnCode::kTableIsNotExist); - response->set_msg("table does not exist"); - return; - } - if (table->GetTableStat() == ::openmldb::storage::kLoading) { - PDLOG(WARNING, "table is loading. tid %u, pid %u", tid, pid); - response->set_code(::openmldb::base::ReturnCode::kTableIsLoading); - response->set_msg("table is loading"); + auto table = GetTable(tid, pid); + if (auto status = CheckTable(tid, pid, false, table); !status.OK()) { + SetResponseStatus(status, response); return; } uint32_t index = 0; @@ -1241,17 +1216,11 @@ void TabletImpl::Scan(RpcController* controller, const ::openmldb::api::ScanRequ void TabletImpl::Count(RpcController* controller, const ::openmldb::api::CountRequest* request, ::openmldb::api::CountResponse* response, Closure* done) { brpc::ClosureGuard done_guard(done); - std::shared_ptr
table = GetTable(request->tid(), request->pid()); - if (!table) { - PDLOG(WARNING, "table does not exist. tid %u, pid %u", request->tid(), request->pid()); - response->set_code(::openmldb::base::ReturnCode::kTableIsNotExist); - response->set_msg("table does not exist"); - return; - } - if (table->GetTableStat() == ::openmldb::storage::kLoading) { - PDLOG(WARNING, "table is loading. tid %u, pid %u", request->tid(), request->pid()); - response->set_code(::openmldb::base::ReturnCode::kTableIsLoading); - response->set_msg("table is loading"); + uint32_t tid = request->tid(); + uint32_t pid = request->pid(); + auto table = GetTable(tid, pid); + if (auto status = CheckTable(tid, pid, false, table); !status.OK()) { + SetResponseStatus(status, response); return; } uint32_t index = 0; @@ -1265,8 +1234,7 @@ void TabletImpl::Count(RpcController* controller, const ::openmldb::api::CountRe } index_def = table->GetIndex(index_name); if (!index_def || !index_def->IsReady()) { - PDLOG(WARNING, "idx name %s not found in table tid %u, pid %u", request->idx_name().c_str(), request->tid(), - request->pid()); + PDLOG(WARNING, "idx name %s not found in table tid %u, pid %u", request->idx_name().c_str(), tid, pid); response->set_code(::openmldb::base::ReturnCode::kIdxNameNotFound); response->set_msg("idx name not found"); return; @@ -1325,17 +1293,9 @@ void TabletImpl::Traverse(RpcController* controller, const ::openmldb::api::Trav brpc::ClosureGuard done_guard(done); uint32_t tid = request->tid(); uint32_t pid = request->pid(); - std::shared_ptr
table = GetTable(tid, pid); - if (!table) { - PDLOG(WARNING, "table does not exist. tid %u, pid %u", tid, pid); - response->set_code(::openmldb::base::ReturnCode::kTableIsNotExist); - response->set_msg("table does not exist"); - return; - } - if (table->GetTableStat() == ::openmldb::storage::kLoading) { - PDLOG(WARNING, "table is loading. tid %u, pid %u", tid, pid); - response->set_code(::openmldb::base::ReturnCode::kTableIsLoading); - response->set_msg("table is loading"); + auto table = GetTable(tid, pid); + if (auto status = CheckTable(tid, pid, false, table); !status.OK()) { + SetResponseStatus(status, response); return; } std::string index_name; @@ -1446,6 +1406,140 @@ void TabletImpl::Traverse(RpcController* controller, const ::openmldb::api::Trav response->set_ts_pos(ts_pos); } +base::Status TabletImpl::CheckTable(uint32_t tid, uint32_t pid, bool check_leader, + const std::shared_ptr
& table) { + if (!table) { + PDLOG(WARNING, "table does not exist. tid %u, pid %u", tid, pid); + return {base::ReturnCode::kTableIsNotExist, "table does not exist"}; + } + if (check_leader && !table->IsLeader()) { + DEBUGLOG("table is follower. tid %u, pid %u", tid, pid); + return {base::ReturnCode::kTableIsFollower, "table is follower"}; + } + if (table->GetTableStat() == ::openmldb::storage::kLoading) { + PDLOG(WARNING, "table is loading. tid %u, pid %u", tid, pid); + return {base::ReturnCode::kTableIsLoading, "table is loading"}; + } + return {}; +} + +base::Status TabletImpl::DeleteAllIndex(const std::shared_ptr& table, + const std::shared_ptr& cur_index, + const std::string& key, + std::optional start_ts, + std::optional end_ts, + bool skip_cur_ts_col, + const std::shared_ptr& client_manager, + uint32_t partition_num) { + storage::Ticket ticket; + std::unique_ptr iter(table->NewIterator(cur_index->GetId(), key, ticket)); + if (start_ts.has_value()) { + iter->Seek(start_ts.value()); + } else { + iter->SeekToFirst(); + } + auto indexs = table->GetAllIndex(); + while (iter->Valid()) { + DEBUGLOG("cur ts %lu cur index pos %u", iter->GetKey(), cur_index->GetId()); + if (end_ts.has_value() && iter->GetKey() <= end_ts.value()) { + break; + } + auto value = iter->GetValue(); + uint32_t data_length = value.size(); + const int8_t* data = reinterpret_cast(value.data()); + std::string uncompress_data; + if (table->GetCompressType() == openmldb::type::kSnappy) { + snappy::Uncompress(value.data(), value.size(), &uncompress_data); + data = reinterpret_cast(uncompress_data.data()); + data_length = uncompress_data.length(); + } + if (data_length < codec::HEADER_LENGTH) { + return {base::ReturnCode::kDeleteFailed, "invalid value"}; + } + uint8_t version = codec::RowView::GetSchemaVersion(data); + auto decoder = table->GetVersionDecoder(version); + if (decoder == nullptr) { + return {base::ReturnCode::kDeleteFailed, "invalid schema version"}; + } + for (const auto& index : indexs) { + if (!index->IsReady()) { + continue; + } + if (cur_index && index->GetId() == cur_index->GetId()) { + continue; + } + auto ts_col = index->GetTsColumn(); + if (skip_cur_ts_col && ts_col->GetId() == cur_index->GetTsColumn()->GetId()) { + continue; + } + sdk::DeleteOption option; + option.idx = index->GetId(); + if (ts_col->IsAutoGenTs()) { + option.start_ts = iter->GetKey(); + } else { + int64_t ts = 0; + if (decoder->GetInteger(data, ts_col->GetId(), ts_col->GetType(), &ts) != 0) { + return {base::ReturnCode::kDeleteFailed, "get ts value failed"}; + } + option.ts_name = ts_col->GetName(); + option.start_ts = ts; + } + if (option.start_ts.value() > 1) { + option.end_ts = option.start_ts.value() - 1; + } + const auto& cols = index->GetColumns(); + if (cols.size() == 1) { + const auto& col = cols.front(); + if (decoder->IsNULL(data, col.GetId())) { + option.key = hybridse::codec::NONETOKEN; + } else if (decoder->GetStrValue(data, col.GetId(), &option.key) != 0) { + return {base::ReturnCode::kDeleteFailed, "get key failed"}; + } + if (option.key.empty()) { + option.key = hybridse::codec::EMPTY_STRING; + } + } else { + for (const auto& col : cols) { + std::string tmp; + if (decoder->IsNULL(data, col.GetId())) { + tmp = hybridse::codec::NONETOKEN; + } else if (decoder->GetStrValue(data, col.GetId(), &tmp) != 0) { + return {base::ReturnCode::kDeleteFailed, "get key failed"}; + } + if (tmp.empty()) { + tmp = hybridse::codec::EMPTY_STRING; + } + if (!option.key.empty()) { + option.key.append("|"); + } + option.key.append(tmp); + } 
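
A quick illustration of the key-assembly convention used by the new DeleteAllIndex helper above: NULL columns map to the NONETOKEN placeholder, empty strings map to EMPTY_STRING, and multi-column keys are joined with `|`. This is a minimal standalone sketch under simplified assumptions; `BuildIndexKey`, `kNoneToken`, and `kEmptyString` are made-up stand-ins, while the real code reads values through the row decoder and uses the `hybridse::codec::NONETOKEN` / `EMPTY_STRING` constants.

```
#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Hypothetical placeholder literals; the real values come from hybridse::codec.
static const std::string kNoneToken = "<none>";
static const std::string kEmptyString = "<empty>";

// Build the delete key for an index from its (possibly NULL) column values,
// mirroring the convention in DeleteAllIndex above.
std::string BuildIndexKey(const std::vector<std::optional<std::string>>& cols) {
    std::string key;
    for (const auto& col : cols) {
        std::string part = !col.has_value() ? kNoneToken
                                            : (col->empty() ? kEmptyString : *col);
        if (!key.empty()) {
            key.append("|");
        }
        key.append(part);
    }
    return key;
}

int main() {
    // e.g. a two-column index (card, merchant) where merchant is NULL
    std::cout << BuildIndexKey({std::string("card0"), std::nullopt}) << "\n";  // card0|<none>
    return 0;
}
```
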
+ } + uint32_t cur_pid = static_cast(base::hash64(option.key)) % partition_num; + auto tablet = client_manager->GetTablet(cur_pid); + if (tablet == nullptr) { + return {base::ReturnCode::kDeleteFailed, absl::StrCat("tablet is nullptr, pid ", cur_pid)}; + } + auto client = tablet->GetClient(); + if (client == nullptr) { + return {base::ReturnCode::kDeleteFailed, absl::StrCat("client is nullptr, pid ", cur_pid)}; + } + DEBUGLOG("delete idx %u pid %u pk %s ts %lu end_ts %lu", + option.idx.value(), cur_pid, option.key.c_str(), option.start_ts.value(), option.end_ts.value()); + std::string msg; + // do not delete other index data + option.enable_decode_value = false; + if (auto status = client->Delete(table->GetId(), cur_pid, option, FLAGS_request_timeout_ms); !status.OK()) { + return {base::ReturnCode::kDeleteFailed, + absl::StrCat("delete failed. key ", option.key, " pid ", cur_pid, " msg: ", status.GetMsg())}; + } + } + + iter->Next(); + } + return {}; +} + void TabletImpl::Delete(RpcController* controller, const ::openmldb::api::DeleteRequest* request, openmldb::api::GeneralResponse* response, Closure* done) { brpc::ClosureGuard done_guard(done); @@ -1456,23 +1550,9 @@ void TabletImpl::Delete(RpcController* controller, const ::openmldb::api::Delete response->set_msg("is follower cluster"); return; } - std::shared_ptr
table = GetTable(tid, pid); - if (!table) { - PDLOG(WARNING, "table does not exist. tid %u, pid %u", tid, pid); - response->set_code(::openmldb::base::ReturnCode::kTableIsNotExist); - response->set_msg("table does not exist"); - return; - } - if (!table->IsLeader()) { - DEBUGLOG("table is follower. tid %u, pid %u", tid, pid); - response->set_code(::openmldb::base::ReturnCode::kTableIsFollower); - response->set_msg("table is follower"); - return; - } - if (table->GetTableStat() == ::openmldb::storage::kLoading) { - PDLOG(WARNING, "table is loading. tid %u, pid %u", tid, pid); - response->set_code(::openmldb::base::ReturnCode::kTableIsLoading); - response->set_msg("table is loading"); + auto table = GetTable(tid, pid); + if (auto status = CheckTable(tid, pid, true, table); !status.OK()) { + SetResponseStatus(status, response); return; } auto replicator = GetReplicator(tid, pid); @@ -1483,21 +1563,24 @@ void TabletImpl::Delete(RpcController* controller, const ::openmldb::api::Delete ::openmldb::api::LogEntry entry; entry.set_term(replicator->GetLeaderTerm()); entry.set_method_type(::openmldb::api::MethodType::kDelete); - uint32_t idx = 0; if (request->dimensions_size() > 0) { - entry.mutable_dimensions()->CopyFrom(request->dimensions()); - idx = entry.dimensions(0).idx(); - } else { - if (request->has_idx_name() && request->idx_name().size() > 0) { - std::shared_ptr index_def = table->GetIndex(request->idx_name()); - if (!index_def || !index_def->IsReady()) { - PDLOG(WARNING, "idx name %s not found in table tid %u, pid %u", request->idx_name().c_str(), tid, pid); - response->set_code(::openmldb::base::ReturnCode::kIdxNameNotFound); - response->set_msg("idx name not found"); - return; - } - idx = index_def->GetId(); + entry.add_dimensions()->CopyFrom(request->dimensions(0)); + auto index_def = table->GetIndex(request->dimensions(0).idx()); + if (!index_def || !index_def->IsReady()) { + PDLOG(WARNING, "index %s not found in table tid %u, pid %u", request->dimensions(0).idx(), tid, pid); + response->set_code(::openmldb::base::ReturnCode::kIdxNameNotFound); + response->set_msg("index not found"); + return; + } + } else if (request->has_idx_name() && !request->idx_name().empty()) { + auto index_def = table->GetIndex(request->idx_name()); + if (!index_def || !index_def->IsReady()) { + PDLOG(WARNING, "idx name %s not found in table tid %u, pid %u", request->idx_name().c_str(), tid, pid); + response->set_code(::openmldb::base::ReturnCode::kIdxNameNotFound); + response->set_msg("index not found"); + return; } + uint32_t idx = index_def->GetId(); if (request->has_key()) { auto dimension = entry.add_dimensions(); dimension->set_key(request->key()); @@ -1519,8 +1602,27 @@ void TabletImpl::Delete(RpcController* controller, const ::openmldb::api::Delete PDLOG(WARNING, "invalid args. tid %u, pid %u", tid, pid); return; } + bool delete_others = false; + if (request->has_enable_decode_value() && request->enable_decode_value()) { + auto indexs = table->GetAllIndex(); + if (entry.dimensions_size() > 0) { + if (indexs.size() > 1) { + delete_others = true; + } + } else if (request->has_ts_name()) { + for (const auto& index : indexs) { + if (!index->IsReady()) { + continue; + } + if (index->GetTsColumn()->GetName() != request->ts_name()) { + delete_others = true; + break; + } + } + } + } auto aggrs = GetAggregators(tid, pid); - if (!aggrs) { + if (!aggrs && !delete_others) { if (table->Delete(entry)) { DEBUGLOG("delete ok. 
tid %u, pid %u, key %s", tid, pid, request->key().c_str()); } else { @@ -1529,7 +1631,7 @@ void TabletImpl::Delete(RpcController* controller, const ::openmldb::api::Delete return; } } else { - auto get_aggregator = [this](std::shared_ptr aggrs, uint32_t idx) -> std::shared_ptr { + auto get_aggregator = [this](const std::shared_ptr& aggrs, uint32_t idx) -> std::shared_ptr { if (aggrs) { for (const auto& aggr : *aggrs) { if (aggr->GetIndexPos() == idx) { @@ -1541,36 +1643,74 @@ void TabletImpl::Delete(RpcController* controller, const ::openmldb::api::Delete }; std::optional start_ts = entry.has_ts() ? std::optional{entry.ts()} : std::nullopt; std::optional end_ts = entry.has_end_ts() ? std::optional{entry.end_ts()} : std::nullopt; + auto handler = catalog_->GetTable(table->GetDB(), table->GetName()); + if (!handler) { + response->set_code(::openmldb::base::ReturnCode::kDeleteFailed); + response->set_msg("no TableHandler"); + PDLOG(WARNING, "no TableHandler. tid %u, pid %u", tid, pid); + return; + } + auto tablet_table_handler = std::dynamic_pointer_cast(handler); + if (!tablet_table_handler) { + response->set_code(::openmldb::base::ReturnCode::kDeleteFailed); + response->set_msg("convert TabletTableHandler failed"); + PDLOG(WARNING, "convert TabletTableHandler failed. tid %u, pid %u", tid, pid); + return; + } + uint32_t pid_num = tablet_table_handler->GetPartitionNum(); + auto table_client_manager = tablet_table_handler->GetTableClientManager(); if (entry.dimensions_size() > 0) { - for (const auto& dimension : entry.dimensions()) { - if (!table->Delete(dimension.idx(), dimension.key(), start_ts, end_ts)) { - response->set_code(::openmldb::base::ReturnCode::kDeleteFailed); - response->set_msg("delete failed"); + const auto& dimension = entry.dimensions(0); + uint32_t idx = dimension.idx(); + auto index_def = table->GetIndex(idx); + const auto& key = dimension.key(); + if (delete_others) { + auto status = DeleteAllIndex(table, index_def, key, start_ts, end_ts, false, + table_client_manager, pid_num); + if (!status.OK()) { + SET_RESP_AND_WARN(response, status.GetCode(), status.GetMsg()); return; } - auto aggr = get_aggregator(aggrs, dimension.idx()); - if (aggr) { - if (!aggr->Delete(dimension.key(), start_ts, end_ts)) { - PDLOG(WARNING, "delete from aggr failed. base table: tid[%u] pid[%u] index[%u] key[%s]. " - "aggr table: tid[%u]", - tid, pid, idx, dimension.key().c_str(), aggr->GetAggrTid()); - response->set_code(::openmldb::base::ReturnCode::kDeleteFailed); - response->set_msg("delete from associated pre-aggr table failed"); - return; - } + } + if (!table->Delete(idx, key, start_ts, end_ts)) { + response->set_code(::openmldb::base::ReturnCode::kDeleteFailed); + response->set_msg("delete failed"); + return; + } + auto aggr = get_aggregator(aggrs, idx); + if (aggr) { + if (!aggr->Delete(key, start_ts, end_ts)) { + PDLOG(WARNING, "delete from aggr failed. base table: tid[%u] pid[%u] index[%u] key[%s]. " + "aggr table: tid[%u]", + tid, pid, idx, key.c_str(), aggr->GetAggrTid()); + response->set_code(::openmldb::base::ReturnCode::kDeleteFailed); + response->set_msg("delete from associated pre-aggr table failed"); + return; } - DEBUGLOG("delete ok. tid %u, pid %u, key %s", tid, pid, dimension.key().c_str()); } + DEBUGLOG("delete ok. 
tid %u, pid %u, key %s", tid, pid, key.c_str()); } else { + bool is_first_hit_index = true; for (const auto& index_def : table->GetAllIndex()) { if (!index_def || !index_def->IsReady()) { continue; } + if (index_def->GetTsColumn()->GetName() != request->ts_name()) { + continue; + } uint32_t idx = index_def->GetId(); std::unique_ptr iter(table->NewTraverseIterator(idx)); iter->SeekToFirst(); while (iter->Valid()) { auto pk = iter->GetPK(); + if (delete_others && is_first_hit_index) { + auto status = DeleteAllIndex(table, index_def, pk, start_ts, end_ts, true, + table_client_manager, pid_num); + if (!status.OK()) { + SET_RESP_AND_WARN(response, status.GetCode(), status.GetMsg()); + return; + } + } iter->NextPK(); if (!table->Delete(idx, pk, start_ts, end_ts)) { response->set_code(::openmldb::base::ReturnCode::kDeleteFailed); @@ -1588,6 +1728,7 @@ void TabletImpl::Delete(RpcController* controller, const ::openmldb::api::Delete } } } + is_first_hit_index = false; } } } @@ -1934,16 +2075,9 @@ void TabletImpl::ChangeRole(RpcController* controller, const ::openmldb::api::Ch brpc::ClosureGuard done_guard(done); uint32_t tid = request->tid(); uint32_t pid = request->pid(); - std::shared_ptr
table = GetTable(tid, pid); - if (!table) { - response->set_code(::openmldb::base::ReturnCode::kTableIsNotExist); - response->set_msg("table does not exist"); - return; - } - if (table->GetTableStat() != ::openmldb::storage::kNormal) { - PDLOG(WARNING, "table state[%u] can not change role. tid[%u] pid[%u]", table->GetTableStat(), tid, pid); - response->set_code(::openmldb::base::ReturnCode::kTableStatusIsNotKnormal); - response->set_msg("table status is not kNormal"); + auto table = GetTable(tid, pid); + if (auto status = CheckTable(tid, pid, false, table); !status.OK()) { + SetResponseStatus(status, response); return; } std::shared_ptr replicator = GetReplicator(tid, pid); @@ -2157,11 +2291,9 @@ void TabletImpl::AppendEntries(RpcController* controller, const ::openmldb::api: brpc::ClosureGuard done_guard(done); uint32_t tid = request->tid(); uint32_t pid = request->pid(); - std::shared_ptr
table = GetTable(tid, pid); - if (!table) { - PDLOG(WARNING, "table does not exist. tid %u, pid %u", tid, pid); - response->set_code(::openmldb::base::ReturnCode::kTableIsNotExist); - response->set_msg("table does not exist"); + auto table = GetTable(tid, pid); + if (auto status = CheckTable(tid, pid, false, table); !status.OK()) { + SetResponseStatus(status, response); return; } if (!follower_.load(std::memory_order_relaxed) && table->IsLeader()) { @@ -2170,12 +2302,6 @@ void TabletImpl::AppendEntries(RpcController* controller, const ::openmldb::api: response->set_msg("table is leader"); return; } - if (table->GetTableStat() == ::openmldb::storage::kLoading) { - response->set_code(::openmldb::base::ReturnCode::kTableIsLoading); - response->set_msg("table is loading"); - PDLOG(WARNING, "table is loading. tid %u, pid %u", tid, pid); - return; - } std::shared_ptr replicator = GetReplicator(tid, pid); if (!replicator) { response->set_code(::openmldb::base::ReturnCode::kReplicatorIsNotExist); @@ -2220,9 +2346,8 @@ void TabletImpl::AppendEntries(RpcController* controller, const ::openmldb::api: return; } if (entry.has_method_type() && entry.method_type() == ::openmldb::api::MethodType::kDelete) { - table->Delete(entry); - } - if (!table->Put(entry)) { + table->Delete(entry); // TODO(hw): error handle + } else if (!table->Put(entry)) { // put if type is not delete PDLOG(WARNING, "fail to put entry. tid %u pid %u", tid, pid); response->set_code(::openmldb::base::ReturnCode::kFailToAppendEntriesToReplicator); response->set_msg("fail to append entry to table"); diff --git a/src/tablet/tablet_impl.h b/src/tablet/tablet_impl.h index 833dbe5ff70..c6ea7cd2b21 100644 --- a/src/tablet/tablet_impl.h +++ b/src/tablet/tablet_impl.h @@ -433,9 +433,18 @@ class TabletImpl : public ::openmldb::api::TabletServer { openmldb::api::QueryResponse& response, butil::IOBuf& buf); // NOLINT void CreateProcedure(const std::shared_ptr& sp_info); + base::Status CheckTable(uint32_t tid, uint32_t pid, bool check_leader, const std::shared_ptr
& table); // refresh the pre-aggr tables info bool RefreshAggrCatalog(); + base::Status DeleteAllIndex(const std::shared_ptr& table, + const std::shared_ptr& cur_index, + const std::string& key, + std::optional start_ts, + std::optional end_ts, + bool skip_cur_ts_col, + const std::shared_ptr& client_manager, + uint32_t partition_num); void UpdateMemoryUsage(); diff --git a/src/tablet/tablet_impl_func_test.cc b/src/tablet/tablet_impl_func_test.cc index c84729f288d..c07084a396d 100644 --- a/src/tablet/tablet_impl_func_test.cc +++ b/src/tablet/tablet_impl_func_test.cc @@ -89,7 +89,7 @@ void CreateBaseTable(::openmldb::storage::Table*& table, // NOLINT dim->set_key(row[1]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - ASSERT_TRUE(table->Put(0, value, request.dimensions())); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } return; } @@ -389,7 +389,7 @@ TEST_P(TabletFuncTest, GetTimeIndex_ts1_iterator) { RunGetTimeIndexAssert(&query_its, base_ts, base_ts - 100); } -INSTANTIATE_TEST_CASE_P(TabletMemAndHDD, TabletFuncTest, +INSTANTIATE_TEST_SUITE_P(TabletMemAndHDD, TabletFuncTest, ::testing::Values(::openmldb::common::kMemory, ::openmldb::common::kHDD, ::openmldb::common::kSSD)); diff --git a/src/tablet/tablet_impl_test.cc b/src/tablet/tablet_impl_test.cc index 1a2de9e66d8..ec9773ce6b3 100644 --- a/src/tablet/tablet_impl_test.cc +++ b/src/tablet/tablet_impl_test.cc @@ -176,6 +176,7 @@ void AddDefaultSchema(uint64_t abs_ttl, uint64_t lat_ttl, ::openmldb::type::TTLT } void AddDefaultAggregatorBaseSchema(::openmldb::api::TableMeta* table_meta) { + table_meta->set_db("db1"); table_meta->set_name("t0"); table_meta->set_pid(1); table_meta->set_mode(::openmldb::api::TableMode::kTableLeader); @@ -191,6 +192,7 @@ void AddDefaultAggregatorBaseSchema(::openmldb::api::TableMeta* table_meta) { } void AddDefaultAggregatorSchema(::openmldb::api::TableMeta* table_meta) { + table_meta->set_db("db1"); table_meta->set_name("pre_aggr_1"); table_meta->set_pid(1); table_meta->set_mode(::openmldb::api::TableMode::kTableLeader); diff --git a/src/tools/data_exporter.cc b/src/tools/data_exporter.cc index fd089281fd6..a20d773abc6 100644 --- a/src/tools/data_exporter.cc +++ b/src/tools/data_exporter.cc @@ -92,9 +92,9 @@ int main(int argc, char* argv[]) { } else { std::string zk_cluster, zk_root_path; ReadZKFromYaml(FLAGS_config_path, &zk_cluster, &zk_root_path); - ::openmldb::sdk::ClusterOptions cluster_options; - cluster_options.zk_cluster = zk_cluster; - cluster_options.zk_path = zk_root_path; + auto cluster_options = std::make_shared<::openmldb::sdk::SQLRouterOptions>(); + cluster_options->zk_cluster = zk_cluster; + cluster_options->zk_path = zk_root_path; tablemeta_reader = new ::openmldb::tools::ClusterTablemetaReader(FLAGS_db_name, FLAGS_table_name, tablet_map, cluster_options); } diff --git a/src/tools/tablemeta_reader.cc b/src/tools/tablemeta_reader.cc index d7f6ae72638..c32aa511c24 100644 --- a/src/tools/tablemeta_reader.cc +++ b/src/tools/tablemeta_reader.cc @@ -123,7 +123,7 @@ std::string TablemetaReader::ReadDBRootPath(const std::string& deploy_dir, const } void StandaloneTablemetaReader::SetTableinfoPtr() { - ::openmldb::sdk::StandAloneSDK standalone_sdk(host_, port_); + ::openmldb::sdk::StandAloneSDK standalone_sdk(options_); standalone_sdk.Init(); tableinfo_ptr_ = standalone_sdk.GetTableInfo(db_name_, table_name_); } diff --git a/src/tools/tablemeta_reader.h b/src/tools/tablemeta_reader.h index 20de072bf68..a3faf1b54a9 100644 --- a/src/tools/tablemeta_reader.h +++ 
b/src/tools/tablemeta_reader.h @@ -80,7 +80,8 @@ class TablemetaReader { class ClusterTablemetaReader : public TablemetaReader { public: ClusterTablemetaReader(const std::string &db_name, const std::string &table_name, - std::unordered_map tablet_map, const ClusterOptions& options) : + std::unordered_map tablet_map, + const std::shared_ptr& options) : TablemetaReader(db_name, table_name, tablet_map), options_(options) {} void SetTableinfoPtr() override; @@ -88,7 +89,7 @@ class ClusterTablemetaReader : public TablemetaReader { bool IsClusterMode() const override { return true; } private: - ClusterOptions options_; + std::shared_ptr options_; }; @@ -96,15 +97,16 @@ class StandaloneTablemetaReader : public TablemetaReader { public: StandaloneTablemetaReader(const std::string &db_name, const std::string &table_name, std::unordered_map tablet_map, const std::string &host, int port) : - TablemetaReader(db_name, table_name, tablet_map), host_(host), port_(port) {} + TablemetaReader(db_name, table_name, tablet_map) { + options_ = std::make_shared(host, port); + } void SetTableinfoPtr() override; bool IsClusterMode() const override { return false; } private: - std::string host_; - uint32_t port_; + std::shared_ptr options_; }; } // namespace tools diff --git a/test/integration-test/README b/test/integration-test/README new file mode 100644 index 00000000000..1b250620412 --- /dev/null +++ b/test/integration-test/README @@ -0,0 +1,31 @@ +# README + +## Configuration script + +``` +test/format_config.sh {openmldbPath} {jobName} {portFrom} {portTo} {type} {Dependency} +``` +The generated configuration is written to out/openmldb_info.yaml + +When Dependency is hadoop, the hadoop config is used + +When Dependency is ssd, deployment on node-1 is not allowed + +## One-click run script +``` +test/openmldb-integration-test.sh -c {caseXML} -d {deployMode} -j {jarVersion} -l {caseLevel} -s {tableStorageMode} -m {executeMode} +``` + +### Version compatibility test +Set the openmldb package version: test/format_config.sh {openmldbPath} + +Set the jar version: test/openmldb-integration-test.sh -j {jarVersion} + +### kafka test +``` +test/integration-test/openmldb-test-java/kafka_test.sh +``` +The environment is configured on node-4: kafka_test (docker) + +## Regression test workflow +The hadoop config files are located at /mnt/hdd0/denglong/openmldb_runner_work/hadoop \ No newline at end of file diff --git a/test/integration-test/openmldb-test-java/kafka_test.sh b/test/integration-test/openmldb-test-java/kafka_test.sh new file mode 100755 index 00000000000..ff585a4f93c --- /dev/null +++ b/test/integration-test/openmldb-test-java/kafka_test.sh @@ -0,0 +1,18 @@ +#! 
/bin/bash + +apiserver="$(awk -F '"' 'NR==7{print $2}' ../../../out/openmldb_info.yaml)" +zkc="zk=""$(awk -F '"' 'NR==2{print $2}' ../../../out/openmldb_info.yaml)" +zkpath="\&zkPath=""$(awk -F '"' 'NR==3{print $2}' ../../../out/openmldb_info.yaml)" +echo "${zkc}" +echo "${zkpath}" +zk="${zkc}""${zkpath}" + +docker exec -it kafka_test /start.sh "${zk}" +sed -i "s#\"bootstrap.servers\":.*#\"bootstrap.servers\":node-4:9092,#" openmldb-ecosystem/src/test/resources/kafka_test_cases.yml +sed -i "s#\"connect.listeners\":.*#\"connect.listeners\":http://node-4:8083,#" openmldb-ecosystem/src/test/resources/kafka_test_cases.yml +sed -i "s#apiserver.address:.*#apiserver.address: ${apiserver}#" openmldb-ecosystem/src/test/resources/kafka_test_cases.yml +sed -i "s#kafka_test?.*#kafka_test?${zk}\"#" openmldb-ecosystem/src/test/resources/kafka_test_cases.yml + +mvn test -pl openmldb-ecosystem + +docker exec -it kafka_test /stop.sh diff --git a/test/integration-test/openmldb-test-java/openmldb-http-test/pom.xml b/test/integration-test/openmldb-test-java/openmldb-http-test/pom.xml index 6a870c1d40d..bddd64a4c6a 100644 --- a/test/integration-test/openmldb-test-java/openmldb-http-test/pom.xml +++ b/test/integration-test/openmldb-test-java/openmldb-http-test/pom.xml @@ -24,6 +24,29 @@ openmldb-sdk-test ${project.version} + + org.uncommons + reportng + 1.1.4 + test + + + org.testng + testng + + + + + io.qameta.allure + allure-testng + 2.12.1 + + + com.google.inject + guice + 4.0 + test + @@ -31,10 +54,10 @@ org.apache.maven.plugins maven-surefire-plugin - 2.20.1 + 2.22.1 false - 1 + 2 ${suiteXmlFile} @@ -44,20 +67,16 @@ target/ - - - - org.aspectj - aspectjweaver - ${aspectj.version} - - - - - org.apache.maven.plugins - maven-surefire-plugin - 2.20.1 - + + + usedefaultlisteners + false + + + listener + org.uncommons.reportng.HTMLReporter, org.uncommons.reportng.JUnitXMLReporter + + caseName @@ -98,6 +117,13 @@ + + + org.aspectj + aspectjweaver + ${aspectj.version} + + diff --git a/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/check/DataChecker.java b/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/check/DataChecker.java index 0799e00e459..81098102ee9 100644 --- a/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/check/DataChecker.java +++ b/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/check/DataChecker.java @@ -35,7 +35,7 @@ public void check() throws Exception { if(MapUtils.isEmpty(data)){ return ; } - String resultData = httpResult.getData(); + String resultData = httpResult.getData().toString(); if(data.containsKey("code")){ Object expectCode = data.get("code"); Object actualCode = JsonPath.read(resultData, "$.code"); diff --git a/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/common/ClusterTest.java b/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/common/ClusterTest.java index b24d2de1652..fe93d1ccb32 100644 --- a/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/common/ClusterTest.java +++ b/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/common/ClusterTest.java @@ -28,6 +28,8 @@ import org.testng.annotations.BeforeTest; import 
org.testng.annotations.Optional; import org.testng.annotations.Parameters; +import com._4paradigm.openmldb.test_common.provider.YamlUtil; +import com._4paradigm.openmldb.test_common.util.Tool; @Slf4j public class ClusterTest extends BaseTest{ @@ -52,7 +54,9 @@ public void beforeTest(@Optional("qa") String env, @Optional("main") String vers openMLDBDeploy.setOpenMLDBPath(openMLDBPath); openMLDBDeploy.setCluster(false); RestfulGlobalVar.mainInfo = openMLDBDeploy.deployCluster(2, 3); - } else { + } else if(env.equalsIgnoreCase("deploy")){ + RestfulGlobalVar.mainInfo = YamlUtil.getObject(Tool.openMLDBDir().getAbsolutePath()+"/out/openmldb_info.yaml",OpenMLDBInfo.class); + } else { OpenMLDBInfo openMLDBInfo = new OpenMLDBInfo(); openMLDBInfo.setDeployType(OpenMLDBDeployType.CLUSTER); openMLDBInfo.setNsNum(2); diff --git a/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/config/FedbRestfulConfig.java b/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/config/FedbRestfulConfig.java index ff216bf3a93..807efaf30b1 100644 --- a/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/config/FedbRestfulConfig.java +++ b/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/config/FedbRestfulConfig.java @@ -77,7 +77,8 @@ public class FedbRestfulConfig { BASE_PATH = CONFIG.getProperty(RestfulGlobalVar.env + "_base_path"); // BASE_URL = CONFIG.getProperty(RestfulGlobalVar.env + "_base_url"); - DB_NAME = CONFIG.getProperty(RestfulGlobalVar.env + "_db_name"); + // DB_NAME = CONFIG.getProperty(RestfulGlobalVar.env + "_db_name"); + DB_NAME = "test_restful"; log.info("HybridSEConfig: db_name: {}", DB_NAME); String versionStr = System.getProperty("fedbVersion"); if (StringUtils.isEmpty(versionStr)) { diff --git a/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/executor/RestfulOnlineExecutor.java b/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/executor/RestfulOnlineExecutor.java new file mode 100644 index 00000000000..3145968e1f1 --- /dev/null +++ b/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/executor/RestfulOnlineExecutor.java @@ -0,0 +1,179 @@ +package com._4paradigm.openmldb.http_test.executor; + + +import com._4paradigm.openmldb.test_common.restful.common.OpenMLDBHttp; +import com._4paradigm.openmldb.test_common.restful.model.HttpMethod; +import com._4paradigm.openmldb.test_common.model.SQLCase; +import com._4paradigm.openmldb.test_common.model.InputDesc; +import com._4paradigm.openmldb.test_common.common.BaseExecutor; +import com._4paradigm.openmldb.test_common.restful.model.HttpResult; +import com._4paradigm.openmldb.test_common.command.OpenMLDBCommandFactory; +import com._4paradigm.openmldb.http_test.util.HttpUtil; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Random; +import org.testng.Assert; +import org.apache.commons.collections4.CollectionUtils; +import com._4paradigm.openmldb.test_common.command.OpenMLDBCommandUtil; +import com._4paradigm.openmldb.test_common.provider.YamlUtil; +import com._4paradigm.openmldb.test_common.util.Tool; +import com._4paradigm.qa.openmldb_deploy.bean.OpenMLDBInfo; +import 
com._4paradigm.openmldb.test_common.restful.model.HttpData; +import com.google.gson.Gson; + + +public class RestfulOnlineExecutor extends BaseExecutor { + + public RestfulOnlineExecutor(SQLCase sqlCase) { + super(sqlCase); + } + + protected List tables = sqlCase.getInputs(); + protected HttpResult httpresult; + protected String deploy; + protected OpenMLDBInfo openMLDBInfo= YamlUtil.getObject(Tool.openMLDBDir().getAbsolutePath()+"/out/openmldb_info.yaml",OpenMLDBInfo.class); + protected String defaultDb = null==sqlCase.getDb()||sqlCase.getDb().equals("null") ? "test_apiserver": sqlCase.getDb(); + String apiServerUrl; + OpenMLDBHttp openMLDBHttp = new OpenMLDBHttp(); + + @Override + public boolean verify(){ + if(null != sqlCase.getMode() && sqlCase.getMode().contains("request-unsupport")){ + return false; + } + if(null != sqlCase.getMode() && sqlCase.getMode().contains("cluster-unsupport")){ + return false; + } + if(null != sqlCase.getMode() && sqlCase.getMode().contains("hybridse-only")){ + return false; + } + if(null != sqlCase.getMode() && sqlCase.getMode().contains("apiserver-unsupport")){ + return false; + } + if(null != sqlCase.getMode() && sqlCase.getMode().contains("rtidb-unsupport")){ + return false; + } + if(null != sqlCase.getMode() && sqlCase.getMode().contains("procedure-unsupport")){ + return false; + } + + return true; + } + + @Override + public void prepare() { + dbName = defaultDb; + apiServerUrl = "http://"+openMLDBInfo.getApiServerEndpoints().get(0); + List tables = sqlCase.getInputs(); + OpenMLDBCommandUtil.createDatabases(openMLDBInfo,dbName,tables); + OpenMLDBCommandFactory.runNoInteractive(openMLDBInfo, dbName, "set @@global.execute_mode='online'"); + if (!CollectionUtils.isEmpty(tables)) { + OpenMLDBCommandUtil.createTables(openMLDBInfo, dbName, tables); + for (InputDesc table : tables) { + tableNames.add(table.getName()); + } + } + Random r = new Random(System.currentTimeMillis()); + deploy = tableNames.get(0)+String.valueOf(r.nextInt(1000000)); + String tmpSql = sqlCase.getSql().toString().replaceAll("\'","\""); + String options = sqlCase.getLongWindow()!=null? "OPTIONS(long_windows=\""+sqlCase.getLongWindow()+"\",RANGE_BIAS=\"inf\", ROWS_BIAS=\"inf\") " + :"OPTIONS(RANGE_BIAS=\"inf\", ROWS_BIAS=\"inf\") "; + tmpSql = options + tmpSql; + OpenMLDBCommandFactory.runNoInteractive(openMLDBInfo, dbName, "deploy "+deploy+" "+tmpSql); + String uri = "/dbs/"+dbName+"/deployments/"+deploy; + HttpResult result = openMLDBHttp.restfulJsonRequest(apiServerUrl,uri,"",HttpMethod.GET); + if(result.getData().contains("\"code\":-1")){ + openMLDBResult.setMsg("deploy fail"); + return; + } + + // first table and data set as request, skipped in prepare stage, other tables and data set as base, added + if (tables.size()>1){ + for (int i=1;i0? tables.get(i).getDb(): dbName; + uri = "/dbs/"+curDb+"/tables/"+tableNames.get(i); + OpenMLDBHttp openMLDBHttp = new OpenMLDBHttp(); + openMLDBHttp.restfulJsonRequest(apiServerUrl,uri,body,HttpMethod.PUT); + } + } + } + } + + @Override + @SuppressWarnings("unchecked") + public void execute(){ + if(openMLDBResult.getMsg().equals("deploy fail")){ + openMLDBResult.setOk(false); + return; + } + List> tmpResults = new ArrayList>(); + HttpResult result = new HttpResult(); + + // set row i as request line, insert row i, repeat + for (int i=0;i tmpResult = (List)data.getData().get("data").get(0); + tmpResults.add(tmpResult); + } else {break;} + body = HttpUtil.formatInputs("value",tables.get(0),i,false); + String curDb = tables.get(0).getDb().length()>0? 
tables.get(0).getDb(): dbName; + uri = "/dbs/"+curDb+"/tables/"+tableNames.get(0); + openMLDBHttp.restfulJsonRequest(apiServerUrl,uri,body,HttpMethod.PUT); + } + + openMLDBResult = HttpUtil.convertHttpResult(sqlCase, result,tmpResults); + } + + @Override + public void check(){ + // success check + if (!sqlCase.getExpect().getSuccess()){ + Assert.assertFalse(openMLDBResult.isOk(),"execute expect fail but success"); + return; + } + // format output + openMLDBResult.setFormattedExpectResults(HttpUtil.FormatOutputs(sqlCase.getExpect().getRows(),sqlCase.getExpect().getColumns())); + openMLDBResult.setFormattedActualResults(HttpUtil.FormatOutputs(openMLDBResult.getResult(),openMLDBResult.getColumnTypes())); + + // size check + //Assert.assertEquals(openMLDBResult.isOk(),sqlCase.getExpect().getSuccess().booleanValue(),"errror "+openMLDBResult.getMsg() ); + Assert.assertEquals(openMLDBResult.getFormattedActualResults().size(),openMLDBResult.getFormattedExpectResults().size()); + + // contents check + for (int i =0;i realColumn = openMLDBResult.getFormattedActualResults().get(i); + Map expectColumn = openMLDBResult.getFormattedExpectResults().get(i); + expectColumn.forEach((k,v)-> { + Assert.assertTrue(realColumn.containsKey(k), "column "+k+"don't exist"); + String errorMessage = String.format("key %s mismatch in case %s", k,sqlCase.getDesc().toString()); + if (v==null){ + Assert.assertNull(realColumn.get(k), errorMessage); + } else if (v instanceof Float ){ + Assert.assertEquals((float)realColumn.get(k),(float)v,1e-4,errorMessage); + } else if (v instanceof Double){ + Assert.assertEquals((double)realColumn.get(k),(double)v,1e-4,errorMessage); + } else { + Assert.assertEquals(realColumn.get(k),v,errorMessage); + } + }); + } + } + + @Override + public void tearDown(){ + OpenMLDBCommandFactory.runNoInteractive(openMLDBInfo, dbName, "drop deployment "+deploy); + for(InputDesc table:sqlCase.getInputs()){ + String curDb = table.getDb().length()>0? table.getDb(): dbName; + OpenMLDBCommandFactory.runNoInteractive(openMLDBInfo, curDb, "drop table "+table.getName()); + } + } + +} diff --git a/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/util/HttpUtil.java b/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/util/HttpUtil.java new file mode 100644 index 00000000000..46c44090484 --- /dev/null +++ b/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/util/HttpUtil.java @@ -0,0 +1,206 @@ +/* + * Copyright 2021 4Paradigm + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com._4paradigm.openmldb.http_test.util; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Collections; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com._4paradigm.openmldb.test_common.model.InputDesc; +import com._4paradigm.openmldb.test_common.bean.OpenMLDBResult; +import com._4paradigm.openmldb.test_common.model.SQLCase; +import com._4paradigm.openmldb.test_common.restful.model.HttpResult; +import lombok.extern.slf4j.Slf4j; +import com._4paradigm.openmldb.test_common.restful.model.HttpData; + +@Slf4j +public class HttpUtil { + public static String formatInputs(String key, InputDesc table,int cur, boolean need_schema){ + String body = ""; + Gson gson = new GsonBuilder().create(); + + List> rows = table.getRows(); + List> tmpRow = new ArrayList>(); + Map data = new HashMap<>(); + tmpRow.add(rows.get(cur)); + // for(List row : rows) { + // tmpRow.add(row); + // } + data.put(key, tmpRow); + if (need_schema){ + data.put("need_schema", true); + } + body = gson.toJson(data); + + return body; + } + + + public static List> FormatOutputs(List> rows,List columns){ + String[] nameType; + Object o; + List> resultLists = new ArrayList>(); + List resultList = new ArrayList(); + if (rows.equals(null)||rows==null||rows.size()==0){return resultLists;} + Collections.sort(rows, new RowsSort(0)); + for (int i=0;i tmp = new HashMap<>(); + for (int j=0;j httpData = data.getData().get("data"); + List> resultList = new ArrayList>(); + for (Object o : httpData) { + resultList.add((List)o); + } + if (resultList.size()>0) { + openMLDBResult.setResult(resultList); + } + + httpData = data.getData().get("schema"); + List schemaList = new ArrayList(); + for (int i = 0 ; i mp = (Map)httpData.get(i); + schemaList.add(mp.get("name")+" "+mp.get("type")); + } + openMLDBResult.setColumnTypes(schemaList); + } catch (Exception e) { + // + } + } else { + openMLDBResult.setOk(false); + // Assert.assertNotEquals(0, httpResult.getData().getCode()); + } + return openMLDBResult; + } + + @SuppressWarnings("unchecked") + public static OpenMLDBResult convertHttpResult(SQLCase sqlCase, HttpResult httpResult, List> tmpResults){ + OpenMLDBResult openMLDBResult = new OpenMLDBResult(); + Gson gson = new Gson(); + HttpData data = gson.fromJson(httpResult.getData(), HttpData.class); + openMLDBResult.setMsg(sqlCase.getDesc()+data.getMsg()); + if (httpResult.getHttpCode()!=200 || data.getCode()!=0 ){ + openMLDBResult.setOk(false); + return openMLDBResult; + } + try { + List httpData = data.getData().get("data"); + openMLDBResult.setResult(tmpResults); + httpData = data.getData().get("schema"); + List schemaList = new ArrayList(); + for (int i = 0 ; i mp = (Map)httpData.get(i); + schemaList.add(mp.get("name").replace(" ", "")+" "+mp.get("type")); + } + + openMLDBResult.setColumnTypes(schemaList); + openMLDBResult.setOk(true); + } catch (Exception e) { + e.printStackTrace(); + log.info( "erro msg is "+data.getMsg()); + openMLDBResult.setOk(false); + } + return openMLDBResult; + } + + public static String switchType(String type){ + switch(type){ + case "smallint": + return "int"; + case "int": + return "int"; + case "bigint": + return "long"; + case "int32": + return "int"; + case "int16": + return "int"; + case "int64": + return "long"; + case "float": + return "float"; + default: + return type; + } + + } + +} diff --git 
a/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/util/RowsSort.java b/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/util/RowsSort.java new file mode 100644 index 00000000000..3fd7b19d283 --- /dev/null +++ b/test/integration-test/openmldb-test-java/openmldb-http-test/src/main/java/com/_4paradigm/openmldb/http_test/util/RowsSort.java @@ -0,0 +1,61 @@ +/* + * Copyright 2021 4Paradigm + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com._4paradigm.openmldb.http_test.util; + +import com._4paradigm.openmldb.test_common.common.ReportLog; +import lombok.extern.slf4j.Slf4j; + +import java.util.Comparator; +import java.util.List; + + +@Slf4j +public class RowsSort implements Comparator { + private int index; + private ReportLog reportLog = ReportLog.of(); + + public RowsSort(int index) { + this.index = index; + if (-1 == index) { + log.warn("compare without index"); + reportLog.warn("compare without index"); + } + } + + @Override + public int compare(List o1, List o2) { + if (-1 == index) { + + return 0; + } + Object obj1 = o1.get(index); + Object obj2 = o2.get(index); + if (obj1 == obj2) { + return 0; + } + if (obj1 == null) { + return -1; + } + if (obj2 == null) { + return 1; + } + if (obj1 instanceof Comparable && obj2 instanceof Comparable) { + return ((Comparable) obj1).compareTo(obj2); + } else { + return obj1.hashCode() - obj2.hashCode(); + } + } +} diff --git a/test/integration-test/openmldb-test-java/openmldb-http-test/src/test/java/com/_4paradigm/openmldb/http_test/v030/TestDeployment.java b/test/integration-test/openmldb-test-java/openmldb-http-test/src/test/java/com/_4paradigm/openmldb/http_test/tmp/TestCluster.java similarity index 52% rename from test/integration-test/openmldb-test-java/openmldb-http-test/src/test/java/com/_4paradigm/openmldb/http_test/v030/TestDeployment.java rename to test/integration-test/openmldb-test-java/openmldb-http-test/src/test/java/com/_4paradigm/openmldb/http_test/tmp/TestCluster.java index b50b64e4302..344c2fdd285 100644 --- a/test/integration-test/openmldb-test-java/openmldb-http-test/src/test/java/com/_4paradigm/openmldb/http_test/v030/TestDeployment.java +++ b/test/integration-test/openmldb-test-java/openmldb-http-test/src/test/java/com/_4paradigm/openmldb/http_test/tmp/TestCluster.java @@ -13,25 +13,24 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com._4paradigm.openmldb.http_test.v030; +package com._4paradigm.openmldb.http_test.tmp; -import com._4paradigm.openmldb.http_test.common.ClusterTest; -import com._4paradigm.openmldb.http_test.common.StandaloneTest; -import com._4paradigm.openmldb.http_test.executor.RestfulCliExecutor; -import com._4paradigm.openmldb.http_test.executor.RestfulExecutor; +import com._4paradigm.openmldb.http_test.executor.RestfulOnlineExecutor; import com._4paradigm.openmldb.test_common.provider.Yaml; -import com._4paradigm.openmldb.test_common.restful.model.RestfulCase; import io.qameta.allure.Feature; import io.qameta.allure.Story; import org.testng.annotations.Test; +import com._4paradigm.openmldb.test_common.common.BaseTest; +import com._4paradigm.openmldb.test_common.model.SQLCase; -@Feature("deployment") -public class TestDeployment extends StandaloneTest { - - @Test(dataProvider = "getCase") - @Yaml(filePaths = "/restful/v030/test_execute_deployment.yaml") - @Story("ExecuteDeployment") - public void testExecute(RestfulCase restfulCase){ - new RestfulCliExecutor(restfulCase).run(); +@Feature("cluster api request") +public class TestCluster { + @Test(dataProvider = "getCase",dataProviderClass = BaseTest.class) + @Yaml(filePaths = "integration_test/cluster/") + @Story("cluster api request") + public void testBatch(SQLCase sqlCase){ + new RestfulOnlineExecutor(sqlCase).run(); } -} + + +} \ No newline at end of file diff --git a/test/integration-test/openmldb-test-java/openmldb-http-test/src/test/java/com/_4paradigm/openmldb/http_test/tmp/TestDropTable.java b/test/integration-test/openmldb-test-java/openmldb-http-test/src/test/java/com/_4paradigm/openmldb/http_test/tmp/TestDropTable.java deleted file mode 100644 index 7d947bf4bb6..00000000000 --- a/test/integration-test/openmldb-test-java/openmldb-http-test/src/test/java/com/_4paradigm/openmldb/http_test/tmp/TestDropTable.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2021 4Paradigm - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com._4paradigm.openmldb.http_test.tmp; - -import com._4paradigm.openmldb.test_common.openmldb.OpenMLDBClient; -import com._4paradigm.openmldb.test_common.bean.OpenMLDBResult; -import com._4paradigm.openmldb.test_common.util.SDKUtil; -import com._4paradigm.openmldb.test_common.restful.model.HttpResult; -import com._4paradigm.openmldb.test_common.util.HttpRequest; -import com.google.gson.Gson; -import com.google.gson.JsonArray; -import com.google.gson.JsonObject; -import com.google.gson.JsonParser; -import org.testng.annotations.Test; - -import java.util.HashMap; - -public class TestDropTable { - - @Test - public void testAll() throws Exception { - OpenMLDBClient fedbClient = new OpenMLDBClient("172.24.4.55:10000","/fedb"); - String apiserver = "172.24.4.55:20000"; - String dbName = "test_zw"; - String url = String.format("http://%s/dbs/%s/tables",apiserver,dbName); - HttpResult httpResult = HttpRequest.get(url, null, new HashMap<>()); -// System.out.println(httpResult.getData()); - Gson gson = new Gson(); - JsonParser parser = new JsonParser(); - JsonObject jsonObject = parser.parse(httpResult.getData()).getAsJsonObject(); - JsonArray tables = jsonObject.getAsJsonArray("tables"); - for(int i=0;i - - - + + + - - - + - - - - - + + + + + + + + - + + \ No newline at end of file diff --git a/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/checker/OptionsChecker.java b/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/checker/OptionsChecker.java index d903425918b..9b3b04958e1 100644 --- a/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/checker/OptionsChecker.java +++ b/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/checker/OptionsChecker.java @@ -54,7 +54,7 @@ public void check() throws Exception { String url = String.format("http://%s/dbs/%s/tables/%s",apiserverEndpoint,dbName,tableName); Tool.sleep(3000); HttpResult httpResult = HttpRequest.get(url); - String resultData = httpResult.getData(); + String resultData = httpResult.getData().toString(); Object partitionNum = JsonPath.read(resultData, "$.table.partition_num"); Object replicaNum = JsonPath.read(resultData, "$.table.replica_num"); Map options = expect.getOptions(); diff --git a/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/BaseExecutor.java b/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/BaseExecutor.java index adc918aa947..3b8bc0608aa 100644 --- a/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/BaseExecutor.java +++ b/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/BaseExecutor.java @@ -43,7 +43,6 @@ public void run() { String className = Thread.currentThread().getStackTrace()[2].getClassName(); String methodName = Thread.currentThread().getStackTrace()[2].getMethodName(); System.out.println(className+"."+methodName+":"+ sqlCase.getCaseFileName()+":"+ sqlCase.getDesc() + " Begin!"); - log.info(className+"."+methodName+":"+ sqlCase.getDesc() + " Begin!"); boolean verify = false; try { verify = verify(); @@ -58,7 +57,7 @@ public void run() { } catch (Exception e) { e.printStackTrace(); 
System.out.println(className+"."+methodName+":"+ sqlCase.getDesc() + " FAIL!"); - Assert.fail("executor run with exception"); + Assert.fail("executor run with exception "+sqlCase.getDesc()); }finally { if(verify) { tearDown(); diff --git a/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/BaseSQLExecutor.java b/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/BaseSQLExecutor.java index a821e305ede..2c46a344b65 100644 --- a/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/BaseSQLExecutor.java +++ b/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/BaseSQLExecutor.java @@ -120,7 +120,6 @@ public void tearDown() { public void tearDown(String version,SqlExecutor executor) { - log.info("version:{},begin tear down",version); List tearDown = sqlCase.getTearDown(); if(CollectionUtils.isNotEmpty(tearDown)){ tearDown.forEach(sql->{ @@ -132,7 +131,6 @@ public void tearDown(String version,SqlExecutor executor) { SDKUtil.sql(executor, dbName, sql); }); } - log.info("version:{},begin drop table",version); List tables = sqlCase.getInputs(); if (CollectionUtils.isEmpty(tables)) { return; diff --git a/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/BatchSQLExecutor.java b/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/BatchSQLExecutor.java index da0649a4190..cdb6bbe27e3 100644 --- a/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/BatchSQLExecutor.java +++ b/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/BatchSQLExecutor.java @@ -82,7 +82,7 @@ public boolean verify() { @Override public void prepare(String version,SqlExecutor executor){ - log.info("version:{} prepare begin",version); + sdkClient.createAndUseDB(dbName); sdkClient.setOnline(); boolean useFirstInputAsRequests = false; @@ -119,12 +119,10 @@ public void prepare(String version,SqlExecutor executor){ } } - log.info("version:{} prepare end",version); } @Override public OpenMLDBResult execute(String version, SqlExecutor executor){ - log.info("version:{} execute begin",version); sdkClient.useDB(dbName); OpenMLDBResult openMLDBResult = null; List sqls = sqlCase.getSqls(); @@ -150,7 +148,6 @@ public OpenMLDBResult execute(String version, SqlExecutor executor){ // openMLDBResult = SDKUtil.sql(executor, dbName, sql); openMLDBResult = sdkClient.execute(sql); } - log.info("version:{} execute end",version); return openMLDBResult; } } diff --git a/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/OfflineJobExecuter.java b/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/OfflineJobExecuter.java index a502b7746f0..99e00dba4bc 100644 --- a/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/OfflineJobExecuter.java +++ b/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/OfflineJobExecuter.java @@ -123,7 +123,12 @@ public void 
prepare(String version,SqlExecutor executor){ // write inputs to csv file String filePath = offlineDataPrefix+tableName+ ".csv"; - ExecUtil.exeCommand("touch "+filePath); + try { + ExecUtil.exeCommand("touch "+filePath); + } catch (Exception e) { + // TODO: handle exception + } + try { BufferedWriter bufferedWriter = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(filePath), "UTF-8")); diff --git a/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/RequestQuerySQLExecutor.java b/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/RequestQuerySQLExecutor.java index 713cbf6a424..51f19b95e15 100644 --- a/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/RequestQuerySQLExecutor.java +++ b/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/main/java/com/_4paradigm/openmldb/java_sdk_test/executor/RequestQuerySQLExecutor.java @@ -155,6 +155,10 @@ public boolean verify() { log.info("skip case in disk mode: {}", sqlCase.getDesc()); return false; } + if (null != sqlCase.getMode() && sqlCase.getMode().contains("procedure-unsupport")) { + log.info("skip case in procedure mode: {}", sqlCase.getDesc()); + return false; + } if (OpenMLDBConfig.isCluster() && null != sqlCase.getMode() && sqlCase.getMode().contains("cluster-unsupport")) { log.info("cluster-unsupport, skip case in cluster request mode: {}", sqlCase.getDesc()); diff --git a/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/test/java/com/_4paradigm/openmldb/java_sdk_test/cluster/high_availability/HighDiskTableTest.java b/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/test/java/com/_4paradigm/openmldb/java_sdk_test/cluster/high_availability/HighDiskTableTest.java index 63f1ca2e300..9b778bf2b9d 100644 --- a/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/test/java/com/_4paradigm/openmldb/java_sdk_test/cluster/high_availability/HighDiskTableTest.java +++ b/test/integration-test/openmldb-test-java/openmldb-sdk-test/src/test/java/com/_4paradigm/openmldb/java_sdk_test/cluster/high_availability/HighDiskTableTest.java @@ -79,7 +79,6 @@ public static void insert10000(Statement statement,String tableName,Long lastTim } i++; } - log.info("stop stop stop"); } diff --git a/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/bean/OpenMLDBResult.java b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/bean/OpenMLDBResult.java index 83c9a748704..21dcfe1d5e9 100644 --- a/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/bean/OpenMLDBResult.java +++ b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/bean/OpenMLDBResult.java @@ -22,6 +22,7 @@ import org.apache.commons.collections4.CollectionUtils; import java.util.List; +import java.util.Map; /** * @author zhaowei @@ -47,6 +48,9 @@ public class OpenMLDBResult { private OpenMLDBJob openMLDBJob; private List offlineColumns; private List> offlineResult; + private List> formattedExpectResults; + private List> formattedActualResults; + @Override public String toString() { diff --git 
a/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/command/OpenMLDBCommandUtil.java b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/command/OpenMLDBCommandUtil.java index affcfd98c19..b2549788015 100644 --- a/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/command/OpenMLDBCommandUtil.java +++ b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/command/OpenMLDBCommandUtil.java @@ -34,21 +34,16 @@ public class OpenMLDBCommandUtil { private static final Logger logger = new LogProxy(log); public static OpenMLDBResult createDB(OpenMLDBInfo openMLDBInfo, String dbName) { - String sql = String.format("create database %s ;",dbName); + String sql = String.format("create database if not exists %s ;",dbName); OpenMLDBResult openMLDBResult = OpenMLDBCommandFacade.sql(openMLDBInfo,dbName,sql); return openMLDBResult; } - public static OpenMLDBResult desc(OpenMLDBInfo openMLDBInfo, String dbName, String tableName) { - String sql = String.format("desc %s ;",tableName); - OpenMLDBResult openMLDBResult = OpenMLDBCommandFacade.sql(openMLDBInfo,dbName,sql); - return openMLDBResult; - } - - public static OpenMLDBResult createAndInsert(OpenMLDBInfo openMLDBInfo, String defaultDBName, List inputs) { + public static void createDatabases(OpenMLDBInfo openMLDBInfo, String defaultDBName, List inputs) { HashSet dbNames = new HashSet<>(); if (StringUtils.isNotEmpty(defaultDBName)) { dbNames.add(defaultDBName); + OpenMLDBResult createDBResult = createDB(openMLDBInfo,defaultDBName); } if (!Objects.isNull(inputs)) { for (InputDesc input : inputs) { @@ -60,6 +55,9 @@ public static OpenMLDBResult createAndInsert(OpenMLDBInfo openMLDBInfo, String d } } } + } + + public static OpenMLDBResult createTables(OpenMLDBInfo openMLDBInfo, String defaultDBName, List inputs) { OpenMLDBResult openMLDBResult = new OpenMLDBResult(); if (inputs != null && inputs.size() > 0) { for (int i = 0; i < inputs.size(); i++) { @@ -71,13 +69,27 @@ public static OpenMLDBResult createAndInsert(OpenMLDBInfo openMLDBInfo, String d createSql = SQLCase.formatSql(createSql, i, tableName); createSql = SQLUtil.formatSql(createSql, openMLDBInfo); if (!createSql.isEmpty()) { - OpenMLDBResult res = OpenMLDBCommandFacade.sql(openMLDBInfo,dbName,createSql); - if (!res.isOk()) { - logger.error("fail to create table"); - // reportLog.error("fail to create table"); - return res; - } + openMLDBResult = OpenMLDBCommandFacade.sql(openMLDBInfo,dbName,createSql); } + } + } + return openMLDBResult; + } + + public static OpenMLDBResult desc(OpenMLDBInfo openMLDBInfo, String dbName, String tableName) { + String sql = String.format("desc %s ;",tableName); + OpenMLDBResult openMLDBResult = OpenMLDBCommandFacade.sql(openMLDBInfo,dbName,sql); + return openMLDBResult; + } + + public static OpenMLDBResult createAndInsert(OpenMLDBInfo openMLDBInfo, String defaultDBName, List inputs) { + createDatabases(openMLDBInfo,defaultDBName,inputs); + createTables(openMLDBInfo,defaultDBName,inputs); + OpenMLDBResult openMLDBResult = new OpenMLDBResult(); + if (inputs != null && inputs.size() > 0) { + for (int i = 0; i < inputs.size(); i++) { + InputDesc inputDesc = inputs.get(i); + String dbName = inputDesc.getDb().isEmpty() ? 
defaultDBName : inputDesc.getDb(); InputDesc input = inputs.get(i); List inserts = input.extractInserts(); for (String insertSql : inserts) { diff --git a/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/common/BaseExecutor.java b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/common/BaseExecutor.java new file mode 100644 index 00000000000..729b46ddfd2 --- /dev/null +++ b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/common/BaseExecutor.java @@ -0,0 +1,72 @@ +/* + * Copyright 2021 4Paradigm + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com._4paradigm.openmldb.test_common.common; + + +import com._4paradigm.openmldb.test_common.bean.OpenMLDBResult; +import com._4paradigm.openmldb.test_common.model.SQLCase; +import com._4paradigm.openmldb.test_common.model.SQLCaseType; +import lombok.extern.slf4j.Slf4j; +import org.testng.Assert; +import org.testng.collections.Lists; +import com._4paradigm.openmldb.test_common.common.IExecutor; + +import java.util.List; + +/** + * @author zhaowei + * @date 2021/3/13 10:18 AM + */ +@Slf4j +public abstract class BaseExecutor implements IExecutor{ +// protected static final log log = new LogProxy(log); + protected SQLCase sqlCase; + protected String dbName; + protected List tableNames = Lists.newArrayList(); + protected OpenMLDBResult openMLDBResult = new OpenMLDBResult(); + + public BaseExecutor(SQLCase sqlCase){ + this.sqlCase=sqlCase; + } + @Override + public void run() { + String className = Thread.currentThread().getStackTrace()[2].getClassName(); + String methodName = Thread.currentThread().getStackTrace()[2].getMethodName(); + System.out.println(className+"."+methodName+":"+ sqlCase.getCaseFileName()+":"+ sqlCase.getDesc() + " Begin!"); + log.info(className+"."+methodName+":"+ sqlCase.getDesc() + " Begin!"); + boolean verify = false; + try { + verify = verify(); + if(!verify) return; + if (null == sqlCase) { + Assert.fail("executor run with null case"); + return; + } + prepare(); + execute(); + check(); + } catch (Exception e) { + e.printStackTrace(); + System.out.println(className+"."+methodName+":"+ sqlCase.getDesc() + " FAIL!"); + Assert.fail("executor run with exception"+sqlCase.getDesc()); + }finally { + if(verify) { + tearDown(); + } + System.out.println(className+"."+methodName+":"+ sqlCase.getDesc() + " DONE!"); + } + } +} diff --git a/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/common/BaseTest.java b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/common/BaseTest.java index b2dfe006698..3a4222946d3 100644 --- a/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/common/BaseTest.java +++ 
b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/common/BaseTest.java @@ -22,7 +22,6 @@ import com._4paradigm.openmldb.test_common.openmldb.OpenMLDBGlobalVar; import com._4paradigm.openmldb.test_common.provider.Yaml; import lombok.extern.slf4j.Slf4j; -import org.slf4j.Logger; import org.testng.Assert; import org.testng.ITest; import org.testng.annotations.BeforeMethod; @@ -31,13 +30,9 @@ import java.io.FileNotFoundException; import java.lang.reflect.Method; -/** - * @author zhaowei - * @date 2021/3/12 7:52 AM - */ + @Slf4j public class BaseTest implements ITest { -// protected static final Logger logger = new LogProxy(log); private ThreadLocal testName = new ThreadLocal<>(); private int testNum = 0; diff --git a/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/openmldb/SDKClient.java b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/openmldb/SDKClient.java index 9ac964cf8e0..cf192539a73 100644 --- a/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/openmldb/SDKClient.java +++ b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/openmldb/SDKClient.java @@ -25,7 +25,7 @@ public static SDKClient of(SqlExecutor executor){ return new SDKClient(executor); } public OpenMLDBResult execute(String sql) { - log.info("execute sql:{}",sql); + log.info("execute sql:{}",sql.replaceAll("\\n", "\\r")); OpenMLDBResult openMLDBResult = new OpenMLDBResult(); openMLDBResult.setSql(sql); try { @@ -50,14 +50,14 @@ public OpenMLDBResult execute(String sql) { } } if(sql.toLowerCase().startsWith("create index")||sql.toLowerCase().startsWith("drop index")){ - Tool.sleep(20*1000); + Tool.sleep(10*1000); } } catch (SQLException e) { openMLDBResult.setOk(false); openMLDBResult.setMsg(e.getMessage()); e.printStackTrace(); } - log.info("openMLDBResult:{}",openMLDBResult); + log.debug("openMLDBResult:{}",openMLDBResult); return openMLDBResult; } public OpenMLDBResult execute(List sqlList) { @@ -117,7 +117,7 @@ public void createAndUseDB(String dbName){ List sqlList = new ArrayList<>(); if(!OpenMLDBGlobalVar.CREATE_DB_NAMES.contains(dbName)){ if (!SDKUtil.dbIsExist(statement,dbName)) { - sqlList.add(String.format("create database %s;", dbName)); + sqlList.add(String.format("create database if not exists %s;", dbName)); OpenMLDBGlobalVar.CREATE_DB_NAMES.add(dbName); } } diff --git a/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/restful/common/OpenMLDBHttp.java b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/restful/common/OpenMLDBHttp.java index 1016deb01e9..3503ef22ca3 100644 --- a/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/restful/common/OpenMLDBHttp.java +++ b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/restful/common/OpenMLDBHttp.java @@ -96,4 +96,29 @@ public HttpResult restfulRequest() { } return result; } + + public HttpResult restfulJsonRequest(String url, String uri, String body, HttpMethod HttpMethod){ + String realUrl = url+uri; + HttpResult result = null; + this.headMap.put("Content-Type","application/json;charset=utf-8"); + try { + 
switch(HttpMethod){ + case GET: + result = HttpRequest.get(realUrl,null,this.headMap); + break; + case POST: + result = HttpRequest.postJson(realUrl,body,this.headMap); + break; + case PUT: + result = HttpRequest.put(realUrl,body,this.headMap); + break; + case DELETE: + result = HttpRequest.get(realUrl,this.data,this.headMap); + break; + } + } catch (Exception e){ + e.printStackTrace(); + } + return result; + } } diff --git a/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/restful/model/HttpData.java b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/restful/model/HttpData.java new file mode 100644 index 00000000000..25fb525c205 --- /dev/null +++ b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/restful/model/HttpData.java @@ -0,0 +1,14 @@ +package com._4paradigm.openmldb.test_common.restful.model; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import lombok.Data; + +@Data +public class HttpData { + private Map> data; + private Integer code; + private String msg; + +} diff --git a/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/util/HttpRequest.java b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/util/HttpRequest.java index 605423446ce..bb7ccb4052c 100644 --- a/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/util/HttpRequest.java +++ b/test/integration-test/openmldb-test-java/openmldb-test-common/src/main/java/com/_4paradigm/openmldb/test_common/util/HttpRequest.java @@ -15,7 +15,10 @@ */ package com._4paradigm.openmldb.test_common.util; +import com._4paradigm.openmldb.test_common.restful.model.HttpData; import com._4paradigm.openmldb.test_common.restful.model.HttpResult; +import com.google.gson.Gson; + import lombok.extern.slf4j.Slf4j; import net.minidev.json.JSONObject; import org.apache.commons.collections4.MapUtils; @@ -58,7 +61,7 @@ public static HttpResult get(String link, Map dataMap, Map dataMap, Map dataMap, Map headMap) throws Exception { - log.info("请求的url:" + link); - log.info("请求的data:" + mapToJson(dataMap)); + log.info("request url:" + link + " data: "+mapToJson(dataMap)); CookieStore cookieStore = new BasicCookieStore(); CloseableHttpClient httpclient = HttpClients.custom().setDefaultCookieStore(cookieStore).build(); HttpPost httpost = new HttpPost(link.toString()); @@ -98,13 +99,11 @@ public static HttpResult postJson(String link, Map dataMap, Map< HttpEntity entity = response.getEntity(); // 把内容转成字符串 String resultString = EntityUtils.toString(entity); - log.info("请求的返回code:" + code); - log.info("请求的返回data:" + resultString); + log.info("response code:" + code + " data:"+ resultString); return getHttpResult(resultString, code,headers,cookies,beginTime,endTime); } public static HttpResult postJson(String link, String json, Map headMap) throws Exception { - log.info("请求的url:" + link); - log.info("请求的data:" + json); + log.info("request url:" + link + " data: "+json); CookieStore cookieStore = new BasicCookieStore(); CloseableHttpClient httpclient = HttpClients.custom().setDefaultCookieStore(cookieStore).build(); HttpPost httpPost = new HttpPost(link.toString()); @@ -124,13 +123,11 @@ public static HttpResult postJson(String link, String json, Map HttpEntity entity = 
response.getEntity(); // 把内容转成字符串 String resultString = EntityUtils.toString(entity); - log.info("请求的返回code:" + code); - log.info("请求的返回data:" + resultString); + log.info("response code:" + code + " data:"+ resultString); return getHttpResult(resultString, code,headers,cookies,beginTime,endTime); } public static HttpResult put(String link, String json, Map headMap) throws IOException { - log.info("请求的url:" + link); - log.info("请求的data:" + json); + log.info("request url:" + link + " data: "+json); CookieStore cookieStore = new BasicCookieStore(); CloseableHttpClient httpclient = HttpClients.custom().setDefaultCookieStore(cookieStore).build(); HttpPut httpPut = new HttpPut(link.toString()); @@ -148,8 +145,7 @@ public static HttpResult put(String link, String json, Map headM HttpEntity entity = response.getEntity(); // 把内容转成字符串 String resultString = EntityUtils.toString(entity); - log.info("请求的返回code:" + code); - log.info("请求的返回data:" + resultString); + log.info("response code:" + code + " data:"+ resultString); return getHttpResult(resultString, code,headers,cookies,beiginTime,endTime); } public static String uploadFile(String filename, String url) throws IOException { @@ -287,8 +283,7 @@ public static HttpResult post(String link, Map parameterMap,Map< } public static HttpResult put(String link, Map dataMap, Map headMap) throws IOException { - log.info("请求的url:" + link); - log.info("请求的data:" + mapToJson(dataMap)); + log.info("request url:" + link + " data: "+mapToJson(dataMap)); CookieStore cookieStore = new BasicCookieStore(); CloseableHttpClient httpclient = HttpClients.custom().setDefaultCookieStore(cookieStore).build(); HttpPut httpput = new HttpPut(link.toString()); @@ -305,8 +300,7 @@ public static HttpResult put(String link, Map dataMap, Map.*" | head -1 | sed 's#.*\(.*\).*#\1#') - sh test/steps/build-java-sdk.sh fi echo "JAVA_SDK_VERSION:${JAVA_SDK_VERSION}" echo "OPENMLDB_SERVER_VERSION:${OPENMLDB_SERVER_VERSION}" diff --git a/test/steps/openmldb-integration-test.sh b/test/steps/openmldb-integration-test.sh new file mode 100755 index 00000000000..e6e95557f41 --- /dev/null +++ b/test/steps/openmldb-integration-test.sh @@ -0,0 +1,59 @@ +#!/bin/bash +while getopts ":c:d:l:s:j:m:" opt +do + case $opt in + c) + echo "option c: $OPTARG" + CASE_XML=$OPTARG + ;; + d) + echo "option d: $OPTARG" + DEPLOY_MODE=$OPTARG + ;; + l) echo "option l: $OPTARG" + CASE_LEVEL=$OPTARG + ;; + s) echo "option s: $OPTARG" + TABLE_STORAGE_MODE=$OPTARG + ;; + j) echo "option j: $OPTARG" + JAR_VERSION=$OPTARG + ;; + m) echo "option m: $OPTARG" + EXECUTE_MODE=$OPTARG + ;; + ?) 
echo "未知参数" + exit 1 + ;; + esac +done +if [[ "${CASE_XML}" == "" ]]; then + CASE_XML="test_all.xml" +fi +if [[ "${DEPLOY_MODE}" == "" ]]; then + DEPLOY_MODE="cluster" +fi +if [[ "${CASE_LEVEL}" == "" ]]; then + CASE_LEVEL="0" +fi +if [[ "${EXECUTE_MODE}" == "" ]]; then + EXECUTE_MODE="javasdk" +fi + +#JAVA_SDK_VERSION=$(more java/pom.xml | grep ".*" | head -1 | sed 's#.*\(.*\).*#\1#') +sh test/steps/modify_java_sdk_config.sh "${CASE_XML}" "${DEPLOY_MODE}" "${JAR_VERSION}" "" "${JAR_VERSION}" "${JAR_VERSION}" "${TABLE_STORAGE_MODE}" +mkdir -p ../mvnrepo +MAVEN_REPO="-Dmaven.repo.local=$(pwd)/../mvnrepo" +export MAVEN_OPTS="${MAVEN_REPO}" +mvn install:install-file -Dfile=openmldb-batch.jar -DartifactId=openmldb-batch -DgroupId=com.4paradigm.openmldb -Dversion="${JAR_VERSION}" -Dpackaging=jar +mvn install:install-file -Dfile=openmldb-jdbc.jar -DartifactId=openmldb-jdbc -DgroupId=com.4paradigm.openmldb -Dversion="${JAR_VERSION}" -Dpackaging=jar +mvn install:install-file -Dfile=openmldb-native.jar -DartifactId=openmldb-native -DgroupId=com.4paradigm.openmldb -Dversion="${JAR_VERSION}" -Dpackaging=jar +mvn install:install-file -Dfile=openmldb-spark-connector.jar -DartifactId=openmldb-spark-connector -DgroupId=com.4paradigm.openmldb -Dversion="${JAR_VERSION}" -Dpackaging=jar + +mvn clean install -B -Dmaven.test.skip=true -f test/test-tool/command-tool/pom.xml +mvn clean install -B -Dmaven.test.skip=true -f test/integration-test/openmldb-test-java/pom.xml -Dopenmldb.native.version="${JAR_VERSION}" -Dopenmldb.jdbc.version="${JAR_VERSION}" -Dopenmldb.batch.version="${JAR_VERSION}" +if [[ "${EXECUTE_MODE}" == "javasdk" ]]; then + mvn clean test -B -e -U -DsuiteXmlFile=test_suite/"${CASE_XML}" -f test/integration-test/openmldb-test-java/openmldb-sdk-test/pom.xml -DcaseLevel="${CASE_LEVEL}" -Dopenmldb.native.version="${JAR_VERSION}" -Dopenmldb.jdbc.version="${JAR_VERSION}" -Dopenmldb.batch.version="${JAR_VERSION}" +elif [[ "${EXECUTE_MODE}" == "apiserver" ]]; then + mvn clean test -B -e -U -DsuiteXmlFile=test_suite/"${CASE_XML}" -f test/integration-test/openmldb-test-java/openmldb-http-test/pom.xml -DcaseLevel="${CASE_LEVEL}" -Dopenmldb.native.version="${JAR_VERSION}" -Dopenmldb.jdbc.version="${JAR_VERSION}" -Dopenmldb.batch.version="${JAR_VERSION}" +fi diff --git a/test/steps/openmldb-sdk-test-java-src.sh b/test/steps/openmldb-sdk-test-java-src.sh index 10d30d1f043..c31a2c46d49 100755 --- a/test/steps/openmldb-sdk-test-java-src.sh +++ b/test/steps/openmldb-sdk-test-java-src.sh @@ -88,15 +88,15 @@ echo "deploy config:" cat ${deployConfigPath} # install command tool cd test/test-tool/command-tool || exit -mvn clean install -Dmaven.test.skip=true +mvn clean install -B -Dmaven.test.skip=true cd "${ROOT_DIR}" || exit # modify config sh test/steps/modify_java_sdk_config.sh "${CASE_XML}" "${DEPLOY_MODE}" "${JAVA_SDK_VERSION}" "" "${OPENMLDB_SERVER_VERSION}" "${JAVA_NATIVE_VERSION}" "${TABLE_STORAGE_MODE}" # install jar cd test/integration-test/openmldb-test-java || exit -mvn clean install -Dmaven.test.skip=true +mvn clean install -B -Dmaven.test.skip=true cd "${ROOT_DIR}" || exit # run case cd "${ROOT_DIR}"/test/integration-test/openmldb-test-java/openmldb-sdk-test || exit -mvn clean test -e -U -DsuiteXmlFile=test_suite/"${CASE_XML}" -DcaseLevel="${CASE_LEVEL}" +mvn clean test -B -e -U -DsuiteXmlFile=test_suite/"${CASE_XML}" -DcaseLevel="${CASE_LEVEL}" diff --git a/third-party/CMakeLists.txt b/third-party/CMakeLists.txt index 6a7f8cb0e07..483d3f51128 100644 --- a/third-party/CMakeLists.txt +++ 
b/third-party/CMakeLists.txt @@ -68,7 +68,7 @@ set(MAKEOPTS "$ENV{MAKEOPTS}" CACHE STRING "Extra options to make") message(STATUS "Install bundled dependencies into ${DEPS_INSTALL_DIR}") set(HYBRIDSQL_ASSERTS_HOME https://github.com/4paradigm/hybridsql-asserts) -set(HYBRIDSQL_ASSERTS_VERSION 0.6.0) +set(HYBRIDSQL_ASSERTS_VERSION 0.6.1) function(get_linux_lsb_release_information) execute_process(COMMAND bash ${CMAKE_SOURCE_DIR}/get-lsb-release.sh @@ -90,17 +90,17 @@ function(init_hybridsql_thirdparty_urls) else() if (LSB_RELEASE_ID_SHORT STREQUAL "centos") set(HYBRIDSQL_ASSERTS_URL "${HYBRIDSQL_ASSERTS_HOME}/releases/download/v${HYBRIDSQL_ASSERTS_VERSION}/thirdparty-${HYBRIDSQL_ASSERTS_VERSION}-linux-gnu-x86_64-centos.tar.gz" PARENT_SCOPE) - set(HYBRIDSQL_ASSERTS_HASH c415dfdc95a127cdce888aec84c7fa3c02f3c9cb973805dcf23b54517e422e36 PARENT_SCOPE) + set(HYBRIDSQL_ASSERTS_HASH 745d0f29cdc0e6073cd83f51e4fdc045622e9027e1cd29f6ef42ca67ac4d726f PARENT_SCOPE) elseif(LSB_RELEASE_ID_SHORT STREQUAL "ubuntu") set(HYBRIDSQL_ASSERTS_URL "${HYBRIDSQL_ASSERTS_HOME}/releases/download/v${HYBRIDSQL_ASSERTS_VERSION}/thirdparty-${HYBRIDSQL_ASSERTS_VERSION}-linux-gnu-x86_64-ubuntu.tar.gz" PARENT_SCOPE) - set(HYBRIDSQL_ASSERTS_HASH 8c95b5fd539c8362d934ae58879d9ae1c27bc0977ca09cc8316ba207e8aaaf1e PARENT_SCOPE) + set(HYBRIDSQL_ASSERTS_HASH 4ee22e1d1b976273c0cb2db54646bc047b1d83f3643697924cff94e9ebd06212 PARENT_SCOPE) else() message(FATAL_ERROR "no pre-compiled thirdparty for your operation system, try compile thirdparty from source with '-DBUILD_BUNDLED=ON'") endif() endif() elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") set(HYBRIDSQL_ASSERTS_URL "${HYBRIDSQL_ASSERTS_HOME}/releases/download/v${HYBRIDSQL_ASSERTS_VERSION}/thirdparty-${HYBRIDSQL_ASSERTS_VERSION}-darwin-i386.tar.gz" PARENT_SCOPE) - set(HYBRIDSQL_ASSERTS_HASH 062e606f1d76fe27003bdc23e643305bfa032eadec8c075e7ce6dc22d70f5044 PARENT_SCOPE) + set(HYBRIDSQL_ASSERTS_HASH bbaef85b441305dc764b403a3f1ef82e11776ceae09b0e3411ab50a5f5adca33 PARENT_SCOPE) endif() endfunction() diff --git a/third-party/cmake/FetchGlog.cmake b/third-party/cmake/FetchGlog.cmake index 8aec8d8c696..458d5052268 100644 --- a/third-party/cmake/FetchGlog.cmake +++ b/third-party/cmake/FetchGlog.cmake @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-set(GLOG_URL https://github.com/google/glog/archive/refs/tags/v0.4.0.tar.gz) +set(GLOG_URL https://github.com/google/glog/archive/refs/tags/v0.6.0.tar.gz) message(STATUS "build glog from ${GLOG_URL}") @@ -20,14 +20,13 @@ find_program(MAKE_EXE NAMES gmake nmake make) ExternalProject_Add( glog URL ${GLOG_URL} - URL_HASH SHA256=f28359aeba12f30d73d9e4711ef356dc842886968112162bc73002645139c39c + URL_HASH SHA256=8a83bf982f37bb70825df71a9709fa90ea9f4447fb3c099e1d720a439d88bad6 PREFIX ${DEPS_BUILD_DIR} DOWNLOAD_DIR ${DEPS_DOWNLOAD_DIR}/glog INSTALL_DIR ${DEPS_INSTALL_DIR} DEPENDS gflags BUILD_IN_SOURCE TRUE - CONFIGURE_COMMAND - ./autogen.sh - COMMAND CXXFLAGS=-fPIC ./configure --prefix= --enable-shared=no --with-gflags= - BUILD_COMMAND ${MAKE_EXE} - INSTALL_COMMAND ${MAKE_EXE} install) + CONFIGURE_COMMAND ${CMAKE_COMMAND} -H -B -DCMAKE_CXX_FLAGS=-fPIC + -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH= -DCMAKE_INSTALL_PREFIX= + BUILD_COMMAND ${CMAKE_COMMAND} --build -- ${MAKEOPTS} + INSTALL_COMMAND ${CMAKE_COMMAND} --build --target install) diff --git a/third-party/cmake/FetchZetasql.cmake b/third-party/cmake/FetchZetasql.cmake index b2b1d580593..bfe1e4c94a0 100644 --- a/third-party/cmake/FetchZetasql.cmake +++ b/third-party/cmake/FetchZetasql.cmake @@ -13,10 +13,10 @@ # limitations under the License. set(ZETASQL_HOME https://github.com/4paradigm/zetasql) -set(ZETASQL_VERSION 0.3.1) -set(ZETASQL_HASH_DARWIN 48bfdfe5fa91d414b0bf8383f116bc2a1f558c12fa286e49ea5ceede366dfbcf) -set(ZETASQL_HASH_LINUX_UBUNTU 3847ed7a60aeda1192adf7d702076d2db2bd49258992e2af67515a57b8f6f6a6) -set(ZETASQL_HASH_LINUX_CENTOS e73e6259ab2df3ae7289a9ae78600b69a8fbb6e4890d07a1031ccb1e37fa4281) +set(ZETASQL_VERSION 0.3.3) +set(ZETASQL_HASH_DARWIN f1c6a4f61b4a3f278dd46ace86f8b5e30780e596ef4af22f22cc12a4a7f83664) +set(ZETASQL_HASH_LINUX_UBUNTU bfe6ef8fd8221e5619dbb66b298ad767a4e1a1326b0c4ccfb75aa9ab872d1ce2) +set(ZETASQL_HASH_LINUX_CENTOS 8b63a149abf9d14fed9e63f465e74c2300d6de7404b859c48a94d4b579d080c2) set(ZETASQL_TAG v${ZETASQL_VERSION}) function(init_zetasql_urls)