+ {{- if eq .Severity "CRITICAL" }} 🔥 CRITICAL
+ {{- else if eq .Severity "HIGH" }} 🚨 HIGH
+ {{- else }} {{ .Severity }} {{- end }}
+
+
+ {{ .InstalledVersion }}
+
+ {{ if .FixedVersion }}{{ .FixedVersion }}{{ else }}N/A{{ end }}
+
+ {{- end }}
+
+
+ {{- else }}
+
+ No Vulnerabilities Found
+ {{- end }}
+{{- end }}
diff --git a/.github/workflows/py-cli-e2e-tests.yml b/.github/workflows/py-cli-e2e-tests.yml
index 9acbfd797753..138f4095e06e 100644
--- a/.github/workflows/py-cli-e2e-tests.yml
+++ b/.github/workflows/py-cli-e2e-tests.yml
@@ -65,7 +65,7 @@ jobs:
- name: configure aws credentials
if: contains('quicksight', matrix.e2e-test) || contains('datalake_s3', matrix.e2e-test) || contains('athena', matrix.e2e-test)
- uses: aws-actions/configure-aws-credentials@v1
+ uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: ${{ secrets.E2E_AWS_IAM_ROLE_ARN }}
role-session-name: github-ci-aws-e2e-tests
diff --git a/.github/workflows/py-tests.yml b/.github/workflows/py-tests.yml
index 586489b012d8..7ca4a5f620d4 100644
--- a/.github/workflows/py-tests.yml
+++ b/.github/workflows/py-tests.yml
@@ -55,7 +55,7 @@ jobs:
docker-images: false
- name: Wait for the labeler
- uses: lewagon/wait-on-check-action@v1.3.3
+ uses: lewagon/wait-on-check-action@v1.3.4
if: ${{ github.event_name == 'pull_request_target' }}
with:
ref: ${{ github.event.pull_request.head.sha }}
diff --git a/.github/workflows/trivy-scan-ingestion-base-slim-image.yml b/.github/workflows/trivy-scan-ingestion-base-slim-image.yml
new file mode 100644
index 000000000000..3805100da053
--- /dev/null
+++ b/.github/workflows/trivy-scan-ingestion-base-slim-image.yml
@@ -0,0 +1,81 @@
+name: Trivy Scan For OpenMetadata Ingestion Base Slim Docker Image
+
+on:
+ pull_request_target:
+ types: [labeled, opened, synchronize, reopened]
+ paths:
+ - "ingestion/**"
+ - "openmetadata-service/**"
+ - "openmetadata-spec/src/main/resources/json/schema/**"
+ - "pom.xml"
+ - "Makefile"
+
+concurrency:
+ group: trivy-ingestion-base-slim-scan-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+jobs:
+ build-and-scan:
+ runs-on: ubuntu-latest
+ permissions:
+ pull-requests: write
+ steps:
+ - name: Wait for the labeler
+ uses: lewagon/wait-on-check-action@v1.3.3
+ if: ${{ github.event_name == 'pull_request_target' }}
+ with:
+ ref: ${{ github.event.pull_request.head.sha }}
+ check-name: Team Label
+ repo-token: ${{ secrets.GITHUB_TOKEN }}
+ wait-interval: 90
+
+ - name: Verify PR labels
+ uses: jesusvasquez333/verify-pr-label-action@v1.4.0
+ if: ${{ github.event_name == 'pull_request_target' }}
+ with:
+ github-token: '${{ secrets.GITHUB_TOKEN }}'
+ valid-labels: 'safe to test'
+ pull-request-number: '${{ github.event.pull_request.number }}'
+ disable-reviews: true # To not auto approve changes
+
+ - name: Checkout Repository
+ uses: actions/checkout@v4
+ with:
+ ref: ${{ github.event.pull_request.head.sha }}
+
+ - name: Prepare for Docker Build
+ id: prepare
+ uses: ./.github/actions/prepare-for-docker-build
+ with:
+ image: openmetadata-ingestion-base-slim
+ tag: trivy
+ is_ingestion: true
+
+
+ - name: Build Docker Image
+ run: |
+ docker build -t openmetadata-ingestion-base-slim:trivy -f ingestion/operators/docker/Dockerfile.ci .
+
+
+ - name: Run Trivy Image Scan
+ id: trivy_scan
+ uses: aquasecurity/trivy-action@master
+ with:
+ scan-type: "image"
+ image-ref: openmetadata-ingestion-base-slim:trivy
+ hide-progress: false
+ ignore-unfixed: true
+ severity: "HIGH,CRITICAL"
+ skip-dirs: "/opt/airflow/dags,/home/airflow/ingestion/pipelines"
+ scan-ref: .
+ format: 'template'
+ template: "@.github/trivy/templates/github.tpl"
+ output: "trivy-result-ingestion-base-slim.md"
+ env:
+ TRIVY_DISABLE_VEX_NOTICE: "true"
+
+ - name: Comment Trivy Scan Results on PR
+ uses: marocchino/sticky-pull-request-comment@v2
+ with:
+ path: trivy-result-ingestion-base-slim.md
+ header: "trivy-scan-${{ github.workflow }}"
diff --git a/.github/workflows/trivy-scan-ingestion-image.yml b/.github/workflows/trivy-scan-ingestion-image.yml
new file mode 100644
index 000000000000..5ac8156bf1a1
--- /dev/null
+++ b/.github/workflows/trivy-scan-ingestion-image.yml
@@ -0,0 +1,81 @@
+name: Trivy Scan For OpenMetadata Ingestion Docker Image
+
+on:
+ pull_request_target:
+ types: [labeled, opened, synchronize, reopened]
+ paths:
+ - "ingestion/**"
+ - "openmetadata-service/**"
+ - "openmetadata-spec/src/main/resources/json/schema/**"
+ - "pom.xml"
+ - "Makefile"
+
+concurrency:
+ group: trivy-ingestion-scan-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+jobs:
+ build-and-scan:
+ runs-on: ubuntu-latest
+ permissions:
+ pull-requests: write
+ steps:
+ - name: Wait for the labeler
+ uses: lewagon/wait-on-check-action@v1.3.3
+ if: ${{ github.event_name == 'pull_request_target' }}
+ with:
+ ref: ${{ github.event.pull_request.head.sha }}
+ check-name: Team Label
+ repo-token: ${{ secrets.GITHUB_TOKEN }}
+ wait-interval: 90
+
+ - name: Verify PR labels
+ uses: jesusvasquez333/verify-pr-label-action@v1.4.0
+ if: ${{ github.event_name == 'pull_request_target' }}
+ with:
+ github-token: '${{ secrets.GITHUB_TOKEN }}'
+ valid-labels: 'safe to test'
+ pull-request-number: '${{ github.event.pull_request.number }}'
+ disable-reviews: true # To not auto approve changes
+
+ - name: Checkout Repository
+ uses: actions/checkout@v4
+ with:
+ ref: ${{ github.event.pull_request.head.sha }}
+
+ - name: Prepare for Docker Build
+ id: prepare
+ uses: ./.github/actions/prepare-for-docker-build
+ with:
+ image: openmetadata-ingestion
+ tag: trivy
+ is_ingestion: true
+
+
+ - name: Build Docker Image
+ run: |
+ docker build -t openmetadata-ingestion:trivy -f ingestion/Dockerfile.ci .
+
+
+ - name: Run Trivy Image Scan
+ id: trivy_scan
+ uses: aquasecurity/trivy-action@master
+ with:
+ scan-type: "image"
+ image-ref: openmetadata-ingestion:trivy
+ hide-progress: false
+ ignore-unfixed: true
+ severity: "HIGH,CRITICAL"
+ skip-dirs: "/opt/airflow/dags,/home/airflow/ingestion/pipelines"
+ scan-ref: .
+ format: 'template'
+ template: "@.github/trivy/templates/github.tpl"
+ output: "trivy-results-ingestion.md"
+ env:
+ TRIVY_DISABLE_VEX_NOTICE: "true"
+
+ - name: Comment Trivy Scan Results on PR
+ uses: marocchino/sticky-pull-request-comment@v2
+ with:
+ path: trivy-results-ingestion.md
+ header: "trivy-scan-${{ github.workflow }}"
diff --git a/.github/workflows/trivy-scan-openmetadta-server.yml b/.github/workflows/trivy-scan-openmetadta-server.yml
new file mode 100644
index 000000000000..615a1229555a
--- /dev/null
+++ b/.github/workflows/trivy-scan-openmetadta-server.yml
@@ -0,0 +1,80 @@
+name: Trivy Scan For OpenMetadata Server Docker Image
+on:
+ pull_request_target:
+ types: [labeled, opened, synchronize, reopened]
+ paths:
+ - "openmetadata-service/**"
+ - "openmetadata-spec/src/main/resources/json/schema/**"
+ - "openmetadata-dist/**"
+ - "openmetadata-clients/**"
+ - "common/**"
+ - "pom.xml"
+ - "yarn.lock"
+ - "Makefile"
+ - "bootstrap/**"
+concurrency:
+ group: trivy-server-scan-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+jobs:
+ build-and-scan:
+ runs-on: ubuntu-latest
+ permissions:
+ pull-requests: write
+ steps:
+ - name: Wait for the labeler
+ uses: lewagon/wait-on-check-action@v1.3.3
+ if: ${{ github.event_name == 'pull_request_target' }}
+ with:
+ ref: ${{ github.event.pull_request.head.sha }}
+ check-name: Team Label
+ repo-token: ${{ secrets.GITHUB_TOKEN }}
+ wait-interval: 90
+
+ - name: Verify PR labels
+ uses: jesusvasquez333/verify-pr-label-action@v1.4.0
+ if: ${{ github.event_name == 'pull_request_target' }}
+ with:
+ github-token: '${{ secrets.GITHUB_TOKEN }}'
+ valid-labels: 'safe to test'
+ pull-request-number: '${{ github.event.pull_request.number }}'
+ disable-reviews: true # To not auto approve changes
+
+ - name: Checkout Repository
+ uses: actions/checkout@v4
+ with:
+ ref: ${{ github.event.pull_request.head.sha }}
+
+ - name: Prepare for Docker Build
+ id: prepare
+ uses: ./.github/actions/prepare-for-docker-build
+ with:
+ image: openmetadata-server
+ tag: trivy
+ is_ingestion: false
+
+ - name: Build Docker Image
+ run: |
+ docker build -t openmetadata-server:trivy -f docker/development/Dockerfile .
+
+ - name: Run Trivy Image Scan
+ id: trivy_scan
+ uses: aquasecurity/trivy-action@master
+ with:
+ scan-type: "image"
+ image-ref: openmetadata-server:trivy
+ hide-progress: false
+ ignore-unfixed: true
+ severity: "HIGH,CRITICAL"
+ scan-ref: .
+ format: 'template'
+ template: "@.github/trivy/templates/github.tpl"
+ output: trivy-result-openmetadata-server.md
+ env:
+ TRIVY_DISABLE_VEX_NOTICE: "true"
+
+ - name: Comment Trivy Scan Results on PR
+ uses: marocchino/sticky-pull-request-comment@v2
+ with:
+ path: trivy-result-openmetadata-server.md
+ header: "trivy-scan-${{ github.workflow }}"
+
diff --git a/README.md b/README.md
index 54f9d2547669..3df44614a9b0 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ Contents:
- [Features](#key-features-of-openmetadata)
- [Try our Sandbox](#try-our-sandbox)
- [Install & Run](#install-and-run-openmetadata)
-- [Roadmap](https://docs.open-metadata.org/v1.3.x/roadmap)
+- [Roadmap](https://docs.open-metadata.org/latest/roadmap)
- [Documentation and Support](#documentation-and-support)
- [Contributors](#contributors)
diff --git a/docker/development/docker-compose.yml b/docker/development/docker-compose.yml
index b5509137a521..ceaef3cececc 100644
--- a/docker/development/docker-compose.yml
+++ b/docker/development/docker-compose.yml
@@ -492,7 +492,7 @@ services:
DB_HOST: ${AIRFLOW_DB_HOST:-mysql}
DB_PORT: ${AIRFLOW_DB_PORT:-3306}
AIRFLOW_DB: ${AIRFLOW_DB:-airflow_db}
- DB_SCHEME: ${AIRFLOW_DB_SCHEME:-mysql+pymysql}
+ DB_SCHEME: ${AIRFLOW_DB_SCHEME:-mysql+mysqldb}
DB_USER: ${AIRFLOW_DB_USER:-airflow_user}
DB_PASSWORD: ${AIRFLOW_DB_PASSWORD:-airflow_pass}
diff --git a/docker/docker-compose-ingestion/docker-compose-ingestion.yml b/docker/docker-compose-ingestion/docker-compose-ingestion.yml
index b6adb2319e31..00749041d98c 100644
--- a/docker/docker-compose-ingestion/docker-compose-ingestion.yml
+++ b/docker/docker-compose-ingestion/docker-compose-ingestion.yml
@@ -26,7 +26,7 @@ services:
DB_HOST: ${AIRFLOW_DB_HOST:-mysql}
DB_PORT: ${AIRFLOW_DB_PORT:-3306}
AIRFLOW_DB: ${AIRFLOW_DB:-airflow_db}
- DB_SCHEME: ${AIRFLOW_DB_SCHEME:-mysql+pymysql}
+ DB_SCHEME: ${AIRFLOW_DB_SCHEME:-mysql+mysqldb}
DB_USER: ${AIRFLOW_DB_USER:-airflow_user}
DB_PASSWORD: ${AIRFLOW_DB_PASSWORD:-airflow_pass}
# extra connection-string properties for the database
diff --git a/docker/docker-compose-quickstart/docker-compose.yml b/docker/docker-compose-quickstart/docker-compose.yml
index a66b57ddfb63..0bec2001ba72 100644
--- a/docker/docker-compose-quickstart/docker-compose.yml
+++ b/docker/docker-compose-quickstart/docker-compose.yml
@@ -497,7 +497,7 @@ services:
DB_HOST: ${AIRFLOW_DB_HOST:-mysql}
DB_PORT: ${AIRFLOW_DB_PORT:-3306}
AIRFLOW_DB: ${AIRFLOW_DB:-airflow_db}
- DB_SCHEME: ${AIRFLOW_DB_SCHEME:-mysql+pymysql}
+ DB_SCHEME: ${AIRFLOW_DB_SCHEME:-mysql+mysqldb}
DB_USER: ${AIRFLOW_DB_USER:-airflow_user}
DB_PASSWORD: ${AIRFLOW_DB_PASSWORD:-airflow_pass}
# extra connection-string properties for the database
diff --git a/docker/images/minimal-ubuntu/Dockerfile b/docker/images/minimal-ubuntu/Dockerfile
index 5457d209599c..5316f7f75979 100644
--- a/docker/images/minimal-ubuntu/Dockerfile
+++ b/docker/images/minimal-ubuntu/Dockerfile
@@ -15,7 +15,7 @@
FROM ubuntu:xenial-20210416
# environment variables
-ENV DEBIAN_FRONTEND noninteractive
+ENV DEBIAN_FRONTEND=noninteractive
# update
RUN apt update -y && \
diff --git a/ingestion/Dockerfile b/ingestion/Dockerfile
index cc0df5ad259c..2be62d4c75e3 100644
--- a/ingestion/Dockerfile
+++ b/ingestion/Dockerfile
@@ -1,6 +1,6 @@
-FROM mysql:8.3 as mysql
+FROM mysql:8.3 AS mysql
-FROM apache/airflow:2.9.1-python3.10
+FROM apache/airflow:2.9.3-python3.10
USER root
RUN curl -sS https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
RUN curl -sS https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list
@@ -78,7 +78,7 @@ ENV PIP_NO_CACHE_DIR=1
ENV PIP_QUIET=1
ARG RI_VERSION="1.6.0.0.dev0"
RUN pip install --upgrade pip
-RUN pip install "openmetadata-managed-apis~=${RI_VERSION}" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.9.1/constraints-3.10.txt"
+RUN pip install "openmetadata-managed-apis~=${RI_VERSION}" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.9.3/constraints-3.10.txt"
RUN pip install "openmetadata-ingestion[${INGESTION_DEPENDENCY}]~=${RI_VERSION}"
# Temporary workaround for https://github.com/open-metadata/OpenMetadata/issues/9593
diff --git a/ingestion/Dockerfile.ci b/ingestion/Dockerfile.ci
index f8433f776ca3..3e6f0346a220 100644
--- a/ingestion/Dockerfile.ci
+++ b/ingestion/Dockerfile.ci
@@ -1,6 +1,6 @@
-FROM mysql:8.3 as mysql
+FROM mysql:8.3 AS mysql
-FROM apache/airflow:2.9.1-python3.10
+FROM apache/airflow:2.9.3-python3.10
USER root
RUN curl -sS https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
RUN curl -sS https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list
@@ -73,7 +73,7 @@ COPY --chown=airflow:0 openmetadata-airflow-apis /home/airflow/openmetadata-airf
COPY --chown=airflow:0 ingestion/examples/airflow/dags /opt/airflow/dags
USER airflow
-ARG AIRFLOW_CONSTRAINTS_LOCATION="https://raw.githubusercontent.com/apache/airflow/constraints-2.9.1/constraints-3.10.txt"
+ARG AIRFLOW_CONSTRAINTS_LOCATION="https://raw.githubusercontent.com/apache/airflow/constraints-2.9.3/constraints-3.10.txt"
# Disable pip cache dir
# https://pip.pypa.io/en/stable/topics/caching/#avoiding-caching
diff --git a/ingestion/ingestion_dependency.sh b/ingestion/ingestion_dependency.sh
index ee54d6f6ac95..2b8372f852f4 100755
--- a/ingestion/ingestion_dependency.sh
+++ b/ingestion/ingestion_dependency.sh
@@ -15,7 +15,7 @@ DB_PORT=${DB_PORT:-3306}
AIRFLOW_DB=${AIRFLOW_DB:-airflow_db}
DB_USER=${DB_USER:-airflow_user}
-DB_SCHEME=${DB_SCHEME:-mysql+pymysql}
+DB_SCHEME=${DB_SCHEME:-mysql+mysqldb}
DB_PASSWORD=${DB_PASSWORD:-airflow_pass}
DB_PROPERTIES=${DB_PROPERTIES:-""}
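The DB_SCHEME default changed here (and in the three compose files above) only swaps the DBAPI driver portion of the SQLAlchemy URL that Airflow builds; host, port, and credentials are untouched. A minimal sketch of the resulting connection strings, using SQLAlchemy's URL parser with the placeholder credentials from these files:

    from sqlalchemy.engine.url import make_url

    # Old default: pure-Python PyMySQL driver
    old = make_url("mysql+pymysql://airflow_user:airflow_pass@mysql:3306/airflow_db")
    # New default: mysqlclient (MySQLdb) driver
    new = make_url("mysql+mysqldb://airflow_user:airflow_pass@mysql:3306/airflow_db")

    print(old.drivername)  # mysql+pymysql
    print(new.drivername)  # mysql+mysqldb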
diff --git a/ingestion/operators/docker/Dockerfile b/ingestion/operators/docker/Dockerfile
index d5d57088d256..c44d33106a3a 100644
--- a/ingestion/operators/docker/Dockerfile
+++ b/ingestion/operators/docker/Dockerfile
@@ -41,6 +41,7 @@ RUN dpkg --configure -a \
&& rm -rf /var/lib/apt/lists/*
# Add updated postgres/redshift dependencies based on libq
+ENV DEBIAN_FRONTEND=noninteractive
RUN curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add -
RUN echo "deb https://apt.postgresql.org/pub/repos/apt/ bullseye-pgdg main" > /etc/apt/sources.list.d/pgdg.list; \
apt-get -qq update; \
diff --git a/ingestion/operators/docker/Dockerfile.ci b/ingestion/operators/docker/Dockerfile.ci
index 27752f954383..a12409b92956 100644
--- a/ingestion/operators/docker/Dockerfile.ci
+++ b/ingestion/operators/docker/Dockerfile.ci
@@ -41,6 +41,7 @@ RUN apt-get -qq update \
&& rm -rf /var/lib/apt/lists/*
# Add updated postgres/redshift dependencies based on libq
+ENV DEBIAN_FRONTEND=noninteractive
RUN curl -sS https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add -
RUN echo "deb https://apt.postgresql.org/pub/repos/apt/ bullseye-pgdg main" > /etc/apt/sources.list.d/pgdg.list; \
apt-get -qq update; \
diff --git a/ingestion/setup.py b/ingestion/setup.py
index 96c426aef8f7..fa0ec7614286 100644
--- a/ingestion/setup.py
+++ b/ingestion/setup.py
@@ -19,7 +19,7 @@
# Add here versions required for multiple plugins
VERSIONS = {
- "airflow": "apache-airflow==2.9.1",
+ "airflow": "apache-airflow==2.9.3",
"adlfs": "adlfs>=2023.1.0",
"avro": "avro>=1.11.3,<1.12",
"boto3": "boto3>=1.20,<2.0", # No need to add botocore separately. It's a dep from boto3
@@ -56,9 +56,13 @@
"elasticsearch8": "elasticsearch8~=8.9.0",
"giturlparse": "giturlparse",
"validators": "validators~=0.22.0",
- "teradata": "teradatasqlalchemy>=20.0.0.0",
+ "teradata": "teradatasqlalchemy==20.0.0.2",
"cockroach": "sqlalchemy-cockroachdb~=2.0",
"cassandra": "cassandra-driver>=3.28.0",
+ "pydoris": "pydoris==1.0.2",
+ "pyiceberg": "pyiceberg==0.5.1",
+ "google-cloud-bigtable": "google-cloud-bigtable>=2.0.0",
+ "pyathena": "pyathena~=3.0",
}
COMMONS = {
@@ -98,7 +102,7 @@
DATA_DIFF = {
driver: f"collate-data-diff[{driver}]"
# data-diff uses different drivers out-of-the-box than OpenMetadata
- # the exrtas are described here:
+ # the extras are described here:
# https://github.com/open-metadata/collate-data-diff/blob/main/pyproject.toml#L68
# install all data diffs with "pip install collate-data-diff[all-dbs]"
for driver in [
@@ -143,8 +147,13 @@
"tabulate==0.9.0",
"typing-inspect",
"packaging", # For version parsing
+ "setuptools~=70.0",
"shapely",
"collate-data-diff",
+ # TODO: Remove one once we have updated datadiff version
+ "snowflake-connector-python>=3.13.1,<4.0.0",
+ "mysql-connector-python>=8.0.29;python_version<'3.9'",
+ "mysql-connector-python>=9.1;python_version>='3.9'",
}
plugins: Dict[str, Set[str]] = {
@@ -155,7 +164,7 @@
VERSIONS["airflow"],
}, # Same as ingestion container. For development.
"amundsen": {VERSIONS["neo4j"]},
- "athena": {"pyathena~=3.0"},
+ "athena": {VERSIONS["pyathena"]},
"atlas": {},
"azuresql": {VERSIONS["pyodbc"]},
"azure-sso": {VERSIONS["msal"]},
@@ -168,7 +177,11 @@
VERSIONS["numpy"],
"sqlalchemy-bigquery>=1.2.2",
},
- "bigtable": {"google-cloud-bigtable>=2.0.0", VERSIONS["pandas"], VERSIONS["numpy"]},
+ "bigtable": {
+ VERSIONS["google-cloud-bigtable"],
+ VERSIONS["pandas"],
+ VERSIONS["numpy"],
+ },
"clickhouse": {
"clickhouse-driver~=0.2",
"clickhouse-sqlalchemy~=0.2",
@@ -247,7 +260,7 @@
"impyla~=0.18.0",
},
"iceberg": {
- "pyiceberg==0.5.1",
+ VERSIONS["pyiceberg"],
# Forcing the version of a few packages so it plays nicely with other requirements.
VERSIONS["pydantic"],
VERSIONS["adlfs"],
@@ -313,7 +326,7 @@
VERSIONS["geoalchemy2"],
},
"sagemaker": {VERSIONS["boto3"]},
- "salesforce": {"simple_salesforce~=1.11"},
+ "salesforce": {"simple_salesforce~=1.11", "authlib>=1.3.1"},
"sample-data": {VERSIONS["avro"], VERSIONS["grpc-tools"]},
"sap-hana": {"hdbcli", "sqlalchemy-hana"},
"sas": {},
@@ -386,6 +399,8 @@
VERSIONS["grpc-tools"],
VERSIONS["neo4j"],
VERSIONS["cockroach"],
+ VERSIONS["pydoris"],
+ VERSIONS["pyiceberg"],
"testcontainers==3.7.1;python_version<'3.9'",
"testcontainers~=4.8.0;python_version>='3.9'",
"minio==7.2.5",
@@ -404,6 +419,13 @@
*plugins["dagster"],
*plugins["oracle"],
*plugins["mssql"],
+ VERSIONS["validators"],
+ VERSIONS["pyathena"],
+ VERSIONS["pyiceberg"],
+ VERSIONS["pydoris"],
+ "python-liquid",
+ VERSIONS["google-cloud-bigtable"],
+ *plugins["bigquery"],
}
e2e_test = {
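The two mysql-connector-python pins added above rely on PEP 508 environment markers (";python_version<'3.9'" / ";python_version>='3.9'") so that a single dependency set carries both constraints and pip selects the right one per interpreter. A quick sketch of how such markers evaluate, using the packaging library already listed in the base requirements:

    from packaging.markers import Marker

    # Exactly one of the two pins applies on any given interpreter
    print(Marker("python_version < '3.9'").evaluate())
    print(Marker("python_version >= '3.9'").evaluate())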
diff --git a/ingestion/src/metadata/ingestion/lineage/masker.py b/ingestion/src/metadata/ingestion/lineage/masker.py
index 69aab2d7ba01..e55783934052 100644
--- a/ingestion/src/metadata/ingestion/lineage/masker.py
+++ b/ingestion/src/metadata/ingestion/lineage/masker.py
@@ -127,4 +127,4 @@ def mask_query(
except Exception as exc:
logger.debug(f"Failed to mask query with sqlfluff: {exc}")
logger.debug(traceback.format_exc())
- return query
+ return None
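With mask_query now returning None on failure instead of echoing the raw query, callers are expected to fall back explicitly at the point of logging, which is what the parser and sql_lineage changes below do with "masked_query or query". A minimal sketch of that calling pattern (the mask_query stub here is only a placeholder for the real function in masker.py):

    from typing import Optional

    def mask_query(query: str) -> Optional[str]:
        """Placeholder for metadata.ingestion.lineage.masker.mask_query,
        which now returns None when masking fails."""
        ...

    def log_query_safely(query: str, logger) -> None:
        masked = mask_query(query)
        # Fall back to the raw query only where it is deliberate, never by
        # silently treating an unmasked query as if it had been masked.
        logger.debug(f"Running lineage with query: {masked or query}")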
diff --git a/ingestion/src/metadata/ingestion/lineage/parser.py b/ingestion/src/metadata/ingestion/lineage/parser.py
index 93bae226d74f..b9925fb1e35e 100644
--- a/ingestion/src/metadata/ingestion/lineage/parser.py
+++ b/ingestion/src/metadata/ingestion/lineage/parser.py
@@ -338,7 +338,7 @@ def stateful_add_joins_from_statement(
logger.debug(
f"Can't extract table names when parsing JOIN information from {comparison}"
)
- logger.debug(f"Query: {self.masked_query}")
+ logger.debug(f"Query: {self.masked_query or self.query}")
continue
left_table_column = TableColumn(table=table_left, column=column_left)
@@ -463,7 +463,7 @@ def get_sqlfluff_lineage_runner(qry: str, dlct: str) -> LineageRunner:
self.masked_query = mask_query(self._clean_query, parser=lr_sqlparser)
logger.debug(
- f"Using sqlparse for lineage parsing for query: {self.masked_query}"
+ f"Using sqlparse for lineage parsing for query: {self.masked_query or self.query}"
)
return lr_sqlparser
diff --git a/ingestion/src/metadata/ingestion/lineage/sql_lineage.py b/ingestion/src/metadata/ingestion/lineage/sql_lineage.py
index f7bd21265fbd..577e6a87498a 100644
--- a/ingestion/src/metadata/ingestion/lineage/sql_lineage.py
+++ b/ingestion/src/metadata/ingestion/lineage/sql_lineage.py
@@ -625,8 +625,8 @@ def get_lineage_by_query(
try:
lineage_parser = LineageParser(query, dialect, timeout_seconds=timeout_seconds)
- masked_query = lineage_parser.masked_query or query
- logger.debug(f"Running lineage with query: {masked_query}")
+ masked_query = lineage_parser.masked_query
+ logger.debug(f"Running lineage with query: {masked_query or query}")
raw_column_lineage = lineage_parser.column_lineage
column_lineage.update(populate_column_lineage_map(raw_column_lineage))
@@ -697,7 +697,7 @@ def get_lineage_by_query(
if not lineage_parser.query_parsing_success:
query_parsing_failures.add(
QueryParsingError(
- query=masked_query,
+ query=masked_query or query,
error=lineage_parser.query_parsing_failure_reason,
)
)
@@ -729,8 +729,10 @@ def get_lineage_via_table_entity(
try:
lineage_parser = LineageParser(query, dialect, timeout_seconds=timeout_seconds)
- masked_query = lineage_parser.masked_query or query
- logger.debug(f"Getting lineage via table entity using query: {masked_query}")
+ masked_query = lineage_parser.masked_query
+ logger.debug(
+ f"Getting lineage via table entity using query: {masked_query or query}"
+ )
to_table_name = table_entity.name.root
for from_table_name in lineage_parser.source_tables:
diff --git a/ingestion/src/metadata/ingestion/ometa/mixins/query_mixin.py b/ingestion/src/metadata/ingestion/ometa/mixins/query_mixin.py
index be2de4fbf4a0..c0b217a804e8 100644
--- a/ingestion/src/metadata/ingestion/ometa/mixins/query_mixin.py
+++ b/ingestion/src/metadata/ingestion/ometa/mixins/query_mixin.py
@@ -42,6 +42,8 @@ def _get_query_hash(self, query: str) -> str:
return str(result.hexdigest())
def _get_or_create_query(self, query: CreateQueryRequest) -> Optional[Query]:
+ if query.query.root is None:
+ return None
query_hash = self._get_query_hash(query=query.query.root)
query_entity = self.get_by_name(entity=Query, fqn=query_hash)
if query_entity is None:
diff --git a/ingestion/src/metadata/ingestion/ometa/routes.py b/ingestion/src/metadata/ingestion/ometa/routes.py
index 8759c663059b..b8538923110a 100644
--- a/ingestion/src/metadata/ingestion/ometa/routes.py
+++ b/ingestion/src/metadata/ingestion/ometa/routes.py
@@ -92,6 +92,7 @@
from metadata.generated.schema.api.services.ingestionPipelines.createIngestionPipeline import (
CreateIngestionPipelineRequest,
)
+from metadata.generated.schema.api.teams.createPersona import CreatePersonaRequest
from metadata.generated.schema.api.teams.createRole import CreateRoleRequest
from metadata.generated.schema.api.teams.createTeam import CreateTeamRequest
from metadata.generated.schema.api.teams.createUser import CreateUserRequest
@@ -157,6 +158,7 @@
from metadata.generated.schema.entity.services.pipelineService import PipelineService
from metadata.generated.schema.entity.services.searchService import SearchService
from metadata.generated.schema.entity.services.storageService import StorageService
+from metadata.generated.schema.entity.teams.persona import Persona
from metadata.generated.schema.entity.teams.role import Role
from metadata.generated.schema.entity.teams.team import Team
from metadata.generated.schema.entity.teams.user import AuthenticationMechanism, User
@@ -217,6 +219,8 @@
CreateTeamRequest.__name__: "/teams",
User.__name__: "/users",
CreateUserRequest.__name__: "/users",
+ Persona.__name__: "/personas",
+ CreatePersonaRequest.__name__: "/personas",
AuthenticationMechanism.__name__: "/users/auth-mechanism",
Bot.__name__: "/bots",
CreateBot.__name__: "/bots",
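Registering Persona and CreatePersonaRequest above is what lets the generic OpenMetadata client resolve the REST path for the new entity. A minimal sketch of the lookup this enables, assuming the mapping is exposed as ROUTES in this module:

    from metadata.generated.schema.api.teams.createPersona import CreatePersonaRequest
    from metadata.generated.schema.entity.teams.persona import Persona
    from metadata.ingestion.ometa.routes import ROUTES

    # Both the create request and the entity now resolve to the same endpoint
    assert ROUTES[CreatePersonaRequest.__name__] == "/personas"
    assert ROUTES[Persona.__name__] == "/personas"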
diff --git a/ingestion/src/metadata/ingestion/source/dashboard/looker/metadata.py b/ingestion/src/metadata/ingestion/source/dashboard/looker/metadata.py
index 598ca5d591bf..fafc59429bb5 100644
--- a/ingestion/src/metadata/ingestion/source/dashboard/looker/metadata.py
+++ b/ingestion/src/metadata/ingestion/source/dashboard/looker/metadata.py
@@ -21,6 +21,7 @@
"""
import copy
import os
+import re
import traceback
from datetime import datetime
from pathlib import Path
@@ -39,6 +40,7 @@
import giturlparse
import lkml
+import networkx as nx
from liquid import Template
from looker_sdk.sdk.api40.methods import Looker40SDK
from looker_sdk.sdk.api40.models import Dashboard as LookerDashboard
@@ -134,6 +136,10 @@
LIST_DASHBOARD_FIELDS = ["id", "title"]
IMPORTED_PROJECTS_DIR = "imported_projects"
+# we need to find the derived references in the SQL query using regex
+# https://cloud.google.com/looker/docs/derived-tables#referencing_derived_tables_in_other_derived_tables
+DERIVED_REFERENCES = r"\${([\w\s\d_.]+)\.SQL_TABLE_NAME}"
+
# Here we can update the fields to get further information, such as:
# created_at, updated_at, last_updater_id, deleted_at, deleter_id, favorite_count, last_viewed_at
GET_DASHBOARD_FIELDS = [
@@ -165,6 +171,13 @@ def build_datamodel_name(model_name: str, explore_name: str) -> str:
return clean_dashboard_name(model_name + "_" + explore_name)
+def find_derived_references(sql_query: str) -> List[str]:
+ if sql_query is None:
+ return []
+ matches = re.findall(DERIVED_REFERENCES, sql_query)
+ return matches
+
+
class LookerSource(DashboardServiceSource):
"""
Looker Source Class.
@@ -172,6 +185,8 @@ class LookerSource(DashboardServiceSource):
Its client uses Looker 40 from the SDK: client = looker_sdk.init40()
"""
+ # pylint: disable=too-many-instance-attributes
+
config: WorkflowSource
metadata: OpenMetadata
client: Looker40SDK
@@ -192,6 +207,10 @@ def __init__(
self._main__lookml_manifest: Optional[LookMLManifest] = None
self._view_data_model: Optional[DashboardDataModel] = None
+ self._parsed_views: Optional[Dict[str, str]] = {}
+ self._unparsed_views: Optional[Dict[str, str]] = {}
+ self._derived_dependencies = nx.DiGraph()
+
self._added_lineage: Optional[Dict] = {}
@classmethod
@@ -557,6 +576,68 @@ def _process_view(
)
)
+ def replace_derived_references(self, sql_query):
+ """
+ Replace all derived references with the parsed views sql query
+ will replace the derived references in the SQL query using regex
+ for e.g. It will replace ${view_name.SQL_TABLE_NAME} with the parsed view query for view_name
+ https://cloud.google.com/looker/docs/derived-tables#referencing_derived_tables_in_other_derived_tables
+ """
+ try:
+ sql_query = re.sub(
+ DERIVED_REFERENCES,
+ # from `${view_name.SQL_TABLE_NAME}` we want the `view_name`.
+ # match.group(1) will give us the `view_name`
+ lambda match: f"({self._parsed_views.get(match.group(1), match.group(0))})",
+ sql_query,
+ )
+ except Exception as e:
+ logger.warning(
+ f"Something went wrong while replacing derived view references: {e}"
+ )
+ return sql_query
+
+ def build_lineage_for_unparsed_views(self) -> Iterable[Either[AddLineageRequest]]:
+ """
+ build lineage by parsing the unparsed views containing derived references
+ """
+ try:
+ # Doing a reversed topological sort to process the views in the right order
+ for view_name in reversed(
+ list(nx.topological_sort(self._derived_dependencies))
+ ):
+ if view_name in self._parsed_views:
+ # Skip if already processed
+ continue
+ sql_query = self.replace_derived_references(
+ self._unparsed_views[view_name]
+ )
+ if view_references := find_derived_references(sql_query):
+ # There are still derived references in the view query
+ logger.debug(
+ f"Views {view_references} not found for {view_name}. Skipping."
+ )
+ continue
+ self._parsed_views[view_name] = sql_query
+ del self._unparsed_views[view_name]
+ yield from self._build_lineage_for_view(view_name, sql_query)
+
+ except Exception as err:
+ yield Either(
+ left=StackTraceError(
+ name="parse_unparsed_views",
+ error=f"Error parsing unparsed views: {err}",
+ stackTrace=traceback.format_exc(),
+ )
+ )
+
+ def _add_dependency_edge(self, view_name: str, view_references: List[str]):
+ """
+ Add a dependency edge between the view and the derived reference
+ """
+ for dependent_view_name in view_references:
+ self._derived_dependencies.add_edge(view_name, dependent_view_name)
+
def add_view_lineage(
self, view: LookMlView, explore: LookmlModelExplore
) -> Iterable[Either[AddLineageRequest]]:
@@ -589,6 +670,7 @@ def add_view_lineage(
for db_service_name in db_service_names or []:
dialect = self._get_db_dialect(db_service_name)
source_table_name = self._clean_table_name(sql_table_name, dialect)
+ self._parsed_views[view.name] = source_table_name
# View to the source is only there if we are informing the dbServiceNames
yield self.build_lineage_request(
@@ -601,20 +683,19 @@ def add_view_lineage(
sql_query = view.derived_table.sql
if not sql_query:
return
+ if find_derived_references(sql_query):
+ sql_query = self.replace_derived_references(sql_query)
+ # If we still have derived references, we cannot process the view
+ if view_references := find_derived_references(sql_query):
+ self._add_dependency_edge(view.name, view_references)
+ logger.warning(
+ f"Not all references are replaced for view [{view.name}]. Parsing it later."
+ )
+ return
logger.debug(f"Processing view [{view.name}] with SQL: \n[{sql_query}]")
- for db_service_name in db_service_names or []:
- lineage_parser = LineageParser(
- sql_query,
- self._get_db_dialect(db_service_name),
- timeout_seconds=30,
- )
- if lineage_parser.source_tables:
- for from_table_name in lineage_parser.source_tables:
- yield self.build_lineage_request(
- source=str(from_table_name),
- db_service_name=db_service_name,
- to_entity=self._view_data_model,
- )
+ yield from self._build_lineage_for_view(view.name, sql_query)
+ if self._unparsed_views:
+ self.build_lineage_for_unparsed_views()
except Exception as err:
yield Either(
@@ -625,6 +706,27 @@ def add_view_lineage(
)
)
+ def _build_lineage_for_view(
+ self, view_name: str, sql_query: str
+ ) -> Iterable[Either[AddLineageRequest]]:
+ """
+ Parse the SQL query and build lineage for the view.
+ """
+ for db_service_name in self.get_db_service_names() or []:
+ lineage_parser = LineageParser(
+ sql_query,
+ self._get_db_dialect(db_service_name),
+ timeout_seconds=30,
+ )
+ if lineage_parser.source_tables:
+ self._parsed_views[view_name] = sql_query
+ for from_table_name in lineage_parser.source_tables:
+ yield self.build_lineage_request(
+ source=str(from_table_name),
+ db_service_name=db_service_name,
+ to_entity=self._view_data_model,
+ )
+
def _get_db_dialect(self, db_service_name) -> Dialect:
db_service = self.metadata.get_by_name(DatabaseService, db_service_name)
return ConnectionTypeDialectMapper.dialect_of(
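The DERIVED_REFERENCES pattern and the networkx dependency graph added above drive the deferred parsing: a view whose SQL still contains ${other_view.SQL_TABLE_NAME} is postponed until the referenced view has been parsed, then substituted in and processed. A standalone sketch of the substitution and ordering (view names and SQL are made up):

    import re
    import networkx as nx

    DERIVED_REFERENCES = r"\${([\w\s\d_.]+)\.SQL_TABLE_NAME}"
    parsed_views = {"orders_base": "SELECT id, amount FROM raw.orders"}

    sql = "SELECT sum(amount) FROM ${orders_base.SQL_TABLE_NAME} GROUP BY 1"
    resolved = re.sub(
        DERIVED_REFERENCES,
        # Replace each reference with the already-parsed view query, in parentheses
        lambda m: f"({parsed_views.get(m.group(1), m.group(0))})",
        sql,
    )
    # -> SELECT sum(amount) FROM (SELECT id, amount FROM raw.orders) GROUP BY 1

    # Unresolved views are wired into a DiGraph and processed in reverse
    # topological order, so dependencies are parsed before their dependents.
    graph = nx.DiGraph()
    graph.add_edge("orders_summary", "orders_base")
    print(list(reversed(list(nx.topological_sort(graph)))))  # ['orders_base', 'orders_summary']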
diff --git a/ingestion/src/metadata/ingestion/source/dashboard/powerbi/client.py b/ingestion/src/metadata/ingestion/source/dashboard/powerbi/client.py
index 381094e18010..cfaa47379ef0 100644
--- a/ingestion/src/metadata/ingestion/source/dashboard/powerbi/client.py
+++ b/ingestion/src/metadata/ingestion/source/dashboard/powerbi/client.py
@@ -47,7 +47,10 @@
logger = utils_logger()
-
+GETGROUPS_DEFAULT_PARAMS = {"$top": "1", "$skip": "0"}
+API_RESPONSE_MESSAGE_KEY = "message"
+AUTH_TOKEN_MAX_RETRIES = 5
+AUTH_TOKEN_RETRY_WAIT = 120
# Similar inner methods with mode client. That's fine.
# pylint: disable=duplicate-code
class PowerBiApiClient:
@@ -59,6 +62,9 @@ class PowerBiApiClient:
def __init__(self, config: PowerBIConnection):
self.config = config
+ self.pagination_entity_per_page = min(
+ 100, self.config.pagination_entity_per_page
+ )
self.msal_client = msal.ConfidentialClientApplication(
client_id=self.config.clientId,
client_credential=self.config.clientSecret.get_secret_value(),
@@ -82,42 +88,84 @@ def get_auth_token(self) -> Tuple[str, str]:
"""
logger.info("Generating PowerBi access token")
- response_data = self.msal_client.acquire_token_silent(
- scopes=self.config.scope, account=None
- )
-
+ response_data = self.get_auth_token_from_cache()
if not response_data:
logger.info("Token does not exist in the cache. Getting a new token.")
- response_data = self.msal_client.acquire_token_for_client(
- scopes=self.config.scope
- )
+ response_data = self.generate_new_auth_token()
+ response_data = response_data or {}
auth_response = PowerBiToken(**response_data)
if not auth_response.access_token:
raise InvalidSourceException(
- "Failed to generate the PowerBi access token. Please check provided config"
+ f"Failed to generate the PowerBi access token. Please check provided config {response_data}"
)
logger.info("PowerBi Access Token generated successfully")
return auth_response.access_token, auth_response.expires_in
+ def generate_new_auth_token(self) -> Optional[dict]:
+ """generate new auth token"""
+ retry = AUTH_TOKEN_MAX_RETRIES
+ while retry:
+ try:
+ response_data = self.msal_client.acquire_token_for_client(
+ scopes=self.config.scope
+ )
+ return response_data
+ except Exception as exc:
+ logger.debug(traceback.format_exc())
+ logger.warning(f"Error generating new auth token: {exc}")
+ # wait for time and retry
+ retry -= 1
+ if retry:
+ logger.warning(
+ f"Error generating new token: {exc}, "
+ f"sleep {AUTH_TOKEN_RETRY_WAIT} seconds retrying {retry} more times.."
+ )
+ sleep(AUTH_TOKEN_RETRY_WAIT)
+ else:
+ logger.warning(
+ "Could not generate new token after maximum retries, "
+ "Please check provided configs"
+ )
+ return None
+
+ def get_auth_token_from_cache(self) -> Optional[dict]:
+ """fetch auth token from cache"""
+ retry = AUTH_TOKEN_MAX_RETRIES
+ while retry:
+ try:
+ response_data = self.msal_client.acquire_token_silent(
+ scopes=self.config.scope, account=None
+ )
+ return response_data
+ except Exception as exc:
+ logger.debug(traceback.format_exc())
+ logger.warning(f"Error getting token from cache: {exc}")
+ retry -= 1
+ if retry:
+ logger.warning(
+ f"Error getting token from cache: {exc}, "
+ f"sleep {AUTH_TOKEN_RETRY_WAIT} seconds retrying {retry} more times.."
+ )
+ sleep(AUTH_TOKEN_RETRY_WAIT)
+ else:
+ logger.warning(
+ "Could not get token from cache after maximum retries, "
+ "Please check provided configs"
+ )
+ return None
+
def fetch_dashboards(self) -> Optional[List[PowerBIDashboard]]:
"""Get dashboards method
Returns:
List[PowerBIDashboard]
"""
- try:
- if self.config.useAdminApis:
- response_data = self.client.get("/myorg/admin/dashboards")
- response = DashboardsResponse(**response_data)
- return response.value
- group = self.fetch_all_workspaces()[0]
- return self.fetch_all_org_dashboards(group_id=group.id)
-
- except Exception as exc: # pylint: disable=broad-except
- logger.debug(traceback.format_exc())
- logger.warning(f"Error fetching dashboards: {exc}")
-
- return None
+ if self.config.useAdminApis:
+ response_data = self.client.get("/myorg/admin/dashboards")
+ response = DashboardsResponse(**response_data)
+ return response.value
+ group = self.fetch_all_workspaces()[0]
+ return self.fetch_all_org_dashboards(group_id=group.id)
def fetch_all_org_dashboards(
self, group_id: str
@@ -205,6 +253,7 @@ def fetch_dataset_tables(
return None
+ # pylint: disable=too-many-branches,too-many-statements
def fetch_all_workspaces(self) -> Optional[List[Group]]:
"""Method to fetch all powerbi workspace details
Returns:
@@ -213,28 +262,94 @@ def fetch_all_workspaces(self) -> Optional[List[Group]]:
try:
admin = "admin/" if self.config.useAdminApis else ""
api_url = f"/myorg/{admin}groups"
- entities_per_page = self.config.pagination_entity_per_page
- params_data = {"$top": "1"}
- response_data = self.client.get(api_url, data=params_data)
- response = GroupsResponse(**response_data)
- count = response.odata_count
+ entities_per_page = self.pagination_entity_per_page
+ failed_indexes = []
+ params_data = GETGROUPS_DEFAULT_PARAMS
+ response = self.client.get(api_url, data=params_data)
+ if (
+ not response
+ or API_RESPONSE_MESSAGE_KEY in response
+ or len(response) != len(GroupsResponse.__annotations__)
+ ):
+ logger.warning("Error fetching workspaces between results: (0, 1)")
+ if response and response.get(API_RESPONSE_MESSAGE_KEY):
+ logger.warning(
+ "Error message from API response: "
+ f"{str(response.get(API_RESPONSE_MESSAGE_KEY))}"
+ )
+ failed_indexes.append(params_data)
+ count = 0
+ else:
+ try:
+ response = GroupsResponse(**response)
+ count = response.odata_count
+ except Exception as exc:
+ logger.warning(f"Error processing GetGroups response: {exc}")
+ count = 0
indexes = math.ceil(count / entities_per_page)
-
workspaces = []
for index in range(indexes):
params_data = {
"$top": str(entities_per_page),
"$skip": str(index * entities_per_page),
}
- response_data = self.client.get(api_url, data=params_data)
- if not response_data:
- logger.error(
- "Error fetching workspaces between results: "
- f"{str(index * entities_per_page)} - {str(entities_per_page)}"
+ response = self.client.get(api_url, data=params_data)
+ if (
+ not response
+ or API_RESPONSE_MESSAGE_KEY in response
+ or len(response) != len(GroupsResponse.__annotations__)
+ ):
+ index_range = (
+ int(params_data.get("$skip")),
+ int(params_data.get("$skip")) + int(params_data.get("$top")),
+ )
+ logger.warning(
+ f"Error fetching workspaces between results: {str(index_range)}"
)
+ if response and response.get(API_RESPONSE_MESSAGE_KEY):
+ logger.warning(
+ "Error message from API response: "
+ f"{str(response.get(API_RESPONSE_MESSAGE_KEY))}"
+ )
+ failed_indexes.append(params_data)
continue
- response = GroupsResponse(**response_data)
- workspaces.extend(response.value)
+ try:
+ response = GroupsResponse(**response)
+ workspaces.extend(response.value)
+ except Exception as exc:
+ logger.warning(f"Error processing GetGroups response: {exc}")
+
+ if failed_indexes:
+ logger.info(
+ "Retrying one more time on failed indexes to get workspaces"
+ )
+ for params_data in failed_indexes:
+ response = self.client.get(api_url, data=params_data)
+ if (
+ not response
+ or API_RESPONSE_MESSAGE_KEY in response
+ or len(response) != len(GroupsResponse.__annotations__)
+ ):
+ index_range = (
+ int(params_data.get("$skip")),
+ int(params_data.get("$skip"))
+ + int(params_data.get("$top")),
+ )
+ logger.warning(
+ f"Workspaces between results {str(index_range)} "
+ "could not be fetched on multiple attempts"
+ )
+ if response and response.get(API_RESPONSE_MESSAGE_KEY):
+ logger.warning(
+ "Error message from API response: "
+ f"{str(response.get(API_RESPONSE_MESSAGE_KEY))}"
+ )
+ continue
+ try:
+ response = GroupsResponse(**response)
+ workspaces.extend(response.value)
+ except Exception as exc:
+ logger.warning(f"Error processing GetGroups response: {exc}")
return workspaces
except Exception as exc: # pylint: disable=broad-except
logger.debug(traceback.format_exc())
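The token handling above now retries up to AUTH_TOKEN_MAX_RETRIES times with a fixed AUTH_TOKEN_RETRY_WAIT pause instead of failing on the first MSAL error. A stripped-down sketch of the same retry loop around a generic acquire callable, reusing the constants from this diff:

    import logging
    import time
    from typing import Callable, Optional

    AUTH_TOKEN_MAX_RETRIES = 5
    AUTH_TOKEN_RETRY_WAIT = 120  # seconds

    def acquire_with_retries(acquire: Callable[[], dict]) -> Optional[dict]:
        retry = AUTH_TOKEN_MAX_RETRIES
        while retry:
            try:
                return acquire()
            except Exception as exc:
                retry -= 1
                if not retry:
                    logging.warning("Could not acquire token after maximum retries")
                    return None
                logging.warning(f"Error acquiring token: {exc}, retrying {retry} more times")
                time.sleep(AUTH_TOKEN_RETRY_WAIT)
        return None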
diff --git a/ingestion/src/metadata/ingestion/source/dashboard/superset/db_source.py b/ingestion/src/metadata/ingestion/source/dashboard/superset/db_source.py
index ff69eea373f6..7896ab23a631 100644
--- a/ingestion/src/metadata/ingestion/source/dashboard/superset/db_source.py
+++ b/ingestion/src/metadata/ingestion/source/dashboard/superset/db_source.py
@@ -212,6 +212,7 @@ def _get_database_name(
if sqa_str:
sqa_url = make_url(sqa_str)
default_db_name = sqa_url.database if sqa_url else None
+
return get_database_name_for_lineage(db_service_entity, default_db_name)
def _get_datasource_fqn(
diff --git a/ingestion/src/metadata/ingestion/source/dashboard/superset/mixin.py b/ingestion/src/metadata/ingestion/source/dashboard/superset/mixin.py
index 718cffb0b833..8c5b0687cc17 100644
--- a/ingestion/src/metadata/ingestion/source/dashboard/superset/mixin.py
+++ b/ingestion/src/metadata/ingestion/source/dashboard/superset/mixin.py
@@ -13,7 +13,10 @@
"""
import json
import traceback
-from typing import Iterable, List, Optional, Union
+from typing import Dict, Iterable, List, Optional, Tuple, Union
+
+from collate_sqllineage.core.models import Column as LineageColumn
+from collate_sqllineage.core.models import Table as LineageTable
from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest
from metadata.generated.schema.entity.data.dashboardDataModel import DashboardDataModel
@@ -34,9 +37,12 @@
from metadata.generated.schema.metadataIngestion.workflow import (
Source as WorkflowSource,
)
+from metadata.generated.schema.type.entityLineage import ColumnLineage
from metadata.generated.schema.type.entityReferenceList import EntityReferenceList
from metadata.ingestion.api.models import Either
from metadata.ingestion.api.steps import InvalidSourceException
+from metadata.ingestion.lineage.parser import LineageParser
+from metadata.ingestion.lineage.sql_lineage import get_column_fqn
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.source.dashboard.dashboard_service import DashboardServiceSource
from metadata.ingestion.source.dashboard.superset.models import (
@@ -47,6 +53,9 @@
FetchDashboard,
SupersetDatasource,
)
+from metadata.ingestion.source.dashboard.superset.utils import (
+ get_dashboard_data_model_column_fqn,
+)
from metadata.ingestion.source.database.column_type_parser import ColumnTypeParser
from metadata.utils import fqn
from metadata.utils.logger import ingestion_logger
@@ -146,6 +155,156 @@ def _get_charts_of_dashboard(
)
return []
+ def _is_table_to_table_lineage(self, columns: tuple, table: LineageTable) -> bool:
+ from_column: LineageColumn = columns[0]
+ to_column: LineageColumn = columns[-1]
+
+ if not isinstance(from_column.parent, LineageTable):
+ return False
+
+ if not isinstance(to_column.parent, LineageTable):
+ return False
+
+ if from_column.parent.schema.raw_name != table.schema.raw_name:
+ return False
+
+ if from_column.parent.raw_name != table.raw_name:
+ return False
+
+ return True
+
+ def _append_value_to_dict_list(
+ self, input_dict: Dict[str, List[str]], dict_key: str, list_value: str
+ ) -> None:
+ if input_dict.get(dict_key):
+ input_dict[dict_key].append(list_value)
+ else:
+ input_dict[dict_key] = [list_value]
+
+ def _get_table_schema(self, table: LineageTable, chart: FetchChart) -> str:
+ if table.schema.raw_name == table.schema.unknown:
+ return chart.table_schema
+
+ return table.schema.raw_name
+
+ def _create_column_lineage_mapping(
+ self, parser: LineageParser, table: LineageTable, chart: FetchChart
+ ) -> Dict[str, List[str]]:
+ result = {}
+ table_to_table_lineage = [
+ _columns
+ for _columns in parser.column_lineage
+ if self._is_table_to_table_lineage(_columns, table)
+ ]
+
+ for columns in table_to_table_lineage:
+ from_column_name = columns[0].raw_name
+ to_column_name = columns[-1].raw_name
+
+ if from_column_name != "*" and to_column_name != "*":
+ self._append_value_to_dict_list(
+ result, to_column_name, from_column_name
+ )
+
+ if from_column_name == "*" and to_column_name == "*":
+ for col_name in self._get_columns_list_for_lineage(chart):
+ self._append_value_to_dict_list(result, col_name, col_name)
+
+ return result
+
+ def _parse_lineage_from_dataset_sql(
+ self, chart_json: FetchChart
+ ) -> List[Tuple[FetchChart, Dict[str, List[str]]]]:
+ # Every SQL query in tables is a SQL statement SELECTING data.
+ # To get lineage we 'simulate' INSERT INTO query into dummy table.
+ result = []
+ parser = LineageParser(f"INSERT INTO dummy_table {chart_json.sql}")
+
+ for table in parser.source_tables:
+ table_name = table.raw_name
+ table_schema = self._get_table_schema(table, chart_json)
+
+ column_mapping: Dict[str, List[str]] = self._create_column_lineage_mapping(
+ parser, table, chart_json
+ )
+
+ result.append(
+ (
+ FetchChart(
+ table_name=table_name,
+ schema=table_schema,
+ sqlalchemy_uri=chart_json.sqlalchemy_uri,
+ ),
+ column_mapping,
+ )
+ )
+
+ return result
+
+ def _enrich_raw_input_tables(
+ self,
+ from_entities: List[Tuple[FetchChart, Dict[str, List[str]]]],
+ to_entity: DashboardDataModel,
+ db_service_entity: DatabaseService,
+ ):
+ result = []
+
+ for from_entity in from_entities:
+ input_table, _column_lineage = from_entity
+ datasource_fqn = self._get_datasource_fqn_for_lineage(
+ input_table, db_service_entity
+ )
+ from_entity = self.metadata.get_by_name(
+ entity=Table,
+ fqn=datasource_fqn,
+ )
+
+ column_lineage: List[ColumnLineage] = []
+ for to_column, from_columns in _column_lineage.items():
+ _from_columns = [
+ get_column_fqn(from_entity, from_column)
+ for from_column in from_columns
+ if get_column_fqn(from_entity, from_column)
+ ]
+
+ _to_column = get_dashboard_data_model_column_fqn(to_entity, to_column)
+
+ if _from_columns and _to_column:
+ column_lineage.append(
+ ColumnLineage(
+ fromColumns=_from_columns,
+ toColumn=_to_column,
+ )
+ )
+
+ result.append((from_entity, column_lineage))
+
+ return result
+
+ def _get_input_tables(self, chart: FetchChart):
+ if chart.sql:
+ result = self._parse_lineage_from_dataset_sql(chart)
+ else:
+ result = [
+ (chart, {c: [c] for c in self._get_columns_list_for_lineage(chart)})
+ ]
+
+ return result
+
+ def _get_dashboard_data_model_entity(
+ self, chart: FetchChart
+ ) -> Optional[DashboardDataModel]:
+ datamodel_fqn = fqn.build(
+ self.metadata,
+ entity_type=DashboardDataModel,
+ service_name=self.config.serviceName,
+ data_model_name=str(chart.datasource_id),
+ )
+ return self.metadata.get_by_name(
+ entity=DashboardDataModel,
+ fqn=datamodel_fqn,
+ )
+
def yield_dashboard_lineage_details(
self,
dashboard_details: Union[FetchDashboard, DashboardResult],
@@ -158,51 +317,40 @@ def yield_dashboard_lineage_details(
entity=DatabaseService, fqn=db_service_name
)
if db_service_entity:
- for chart_id in self._get_charts_of_dashboard(dashboard_details):
- chart_json = self.all_charts.get(chart_id)
- if chart_json:
- try:
- datasource_fqn = self._get_datasource_fqn_for_lineage(
- chart_json, db_service_entity
- )
- if not datasource_fqn:
- continue
- from_entity = self.metadata.get_by_name(
- entity=Table,
- fqn=datasource_fqn,
- )
- datamodel_fqn = fqn.build(
- self.metadata,
- entity_type=DashboardDataModel,
- service_name=self.config.serviceName,
- data_model_name=str(chart_json.datasource_id),
- )
- to_entity = self.metadata.get_by_name(
- entity=DashboardDataModel,
- fqn=datamodel_fqn,
- )
+ for chart_json in filter(
+ None,
+ [
+ self.all_charts.get(chart_id)
+ for chart_id in self._get_charts_of_dashboard(dashboard_details)
+ ],
+ ):
+ try:
+ to_entity = self._get_dashboard_data_model_entity(chart_json)
- columns_list = self._get_columns_list_for_lineage(chart_json)
- column_lineage = self._get_column_lineage(
- from_entity, to_entity, columns_list
+ if to_entity:
+ _input_tables = self._get_input_tables(chart_json)
+ input_tables = self._enrich_raw_input_tables(
+ _input_tables, to_entity, db_service_entity
)
- if from_entity and to_entity:
+ for input_table in input_tables:
+ from_entity_table, column_lineage = input_table
+
yield self._get_add_lineage_request(
to_entity=to_entity,
- from_entity=from_entity,
+ from_entity=from_entity_table,
column_lineage=column_lineage,
)
- except Exception as exc:
- yield Either(
- left=StackTraceError(
- name=db_service_name,
- error=(
- "Error to yield dashboard lineage details for DB "
- f"service name [{db_service_name}]: {exc}"
- ),
- stackTrace=traceback.format_exc(),
- )
+ except Exception as exc:
+ yield Either(
+ left=StackTraceError(
+ name=db_service_name,
+ error=(
+ "Error to yield dashboard lineage details for DB "
+ f"service name [{db_service_name}]: {exc}"
+ ),
+ stackTrace=traceback.format_exc(),
)
+ )
def _get_datamodel(
self, datamodel: Union[SupersetDatasource, FetchChart]
@@ -238,6 +386,18 @@ def parse_array_data_type(self, col_parse: dict) -> Optional[str]:
return DataType(col_parse["arrayDataType"])
return None
+ def parse_row_data_type(self, col_parse: dict) -> List[Column]:
+ """
+ Set children to single UNKNOWN column for Trino row columns
+ to prevent validation error requiring non empty list of children.
+ """
+ if col_parse["dataType"] == "ROW" and not col_parse.get("children"):
+ return [Column(name="unknown", dataType=DataType.UNKNOWN)]
+
+ if col_parse.get("children"):
+ return col_parse["children"]
+ return []
+
def get_column_info(
self, data_source: List[Union[DataSourceResult, FetchColumn]]
) -> Optional[List[Column]]:
@@ -259,9 +419,7 @@ def get_column_info(
dataTypeDisplay=field.type,
dataType=col_parse["dataType"],
arrayDataType=self.parse_array_data_type(col_parse),
- children=list(col_parse["children"])
- if col_parse.get("children")
- else None,
+ children=self.parse_row_data_type(col_parse),
name=str(field.id),
displayName=field.column_name,
description=field.description,
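The "INSERT INTO dummy_table" wrapper above turns a virtual dataset's SELECT into a statement the lineage parser can attribute: the SELECT's sources become lineage source tables and its columns map onto the synthetic target. A minimal sketch of what _parse_lineage_from_dataset_sql relies on (the sample SQL is made up; dummy_table is only a parsing aid):

    from metadata.ingestion.lineage.parser import LineageParser

    dataset_sql = "SELECT o.id, o.amount FROM sales.orders o"
    # Wrap the dataset's SELECT so the parser sees a write target
    parser = LineageParser(f"INSERT INTO dummy_table {dataset_sql}")

    print(parser.source_tables)   # e.g. the sales.orders table
    print(parser.column_lineage)  # column tuples flowing from sales.orders into dummy_table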
diff --git a/ingestion/src/metadata/ingestion/source/dashboard/superset/models.py b/ingestion/src/metadata/ingestion/source/dashboard/superset/models.py
index d9d40e1214ad..15bae1c75d12 100644
--- a/ingestion/src/metadata/ingestion/source/dashboard/superset/models.py
+++ b/ingestion/src/metadata/ingestion/source/dashboard/superset/models.py
@@ -151,6 +151,7 @@ class FetchChart(BaseModel):
sqlalchemy_uri: Optional[str] = None
viz_type: Optional[str] = None
datasource_id: Optional[int] = None
+ sql: Optional[str] = None
class FetchColumn(BaseModel):
diff --git a/ingestion/src/metadata/ingestion/source/dashboard/superset/queries.py b/ingestion/src/metadata/ingestion/source/dashboard/superset/queries.py
index 144c5322f749..b333f2b946ab 100644
--- a/ingestion/src/metadata/ingestion/source/dashboard/superset/queries.py
+++ b/ingestion/src/metadata/ingestion/source/dashboard/superset/queries.py
@@ -23,6 +23,7 @@
t.id AS table_id,
t.table_name,
t.schema,
+ t.sql,
db.database_name,
db.sqlalchemy_uri
from
@@ -92,10 +93,10 @@
FETCH_COLUMN = """
select
tc.id,
- table_name ,
- column_name,
- table_id,
- type,
+ tc.table_name ,
+ tc.column_name,
+ tc.table_id,
+ tc.type,
tc.description
from
table_columns tc
@@ -104,5 +105,5 @@
on
t.id=tc.table_id
where
- table_id=%(table_id)s
+ tc.table_id=%(table_id)s
"""
diff --git a/ingestion/src/metadata/ingestion/source/dashboard/superset/utils.py b/ingestion/src/metadata/ingestion/source/dashboard/superset/utils.py
new file mode 100644
index 000000000000..3c1eb910b6a4
--- /dev/null
+++ b/ingestion/src/metadata/ingestion/source/dashboard/superset/utils.py
@@ -0,0 +1,36 @@
+# Copyright 2021 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Superset utils module
+"""
+
+from typing import Optional
+
+from metadata.generated.schema.entity.data.dashboardDataModel import DashboardDataModel
+
+
+def get_dashboard_data_model_column_fqn(
+ dashboard_data_model_entity: DashboardDataModel, column: str
+) -> Optional[str]:
+ """
+ Get fqn of column if exist in dashboard data model entity.
+
+ This is Superset implementation specific as table name is stored within displayName (table name contains
+ numerical id), which is not consistent with implementations of dashboard data model columns of
+ other dashboard sources.
+ """
+ if not dashboard_data_model_entity:
+ return None
+ for dashboard_data_model_column in dashboard_data_model_entity.columns:
+ if column.lower() == dashboard_data_model_column.displayName.lower():
+ return dashboard_data_model_column.fullyQualifiedName.root
+
+ return None
diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/metadata.py b/ingestion/src/metadata/ingestion/source/database/bigquery/metadata.py
index 98480f6a4d46..6fdfd2de4733 100644
--- a/ingestion/src/metadata/ingestion/source/database/bigquery/metadata.py
+++ b/ingestion/src/metadata/ingestion/source/database/bigquery/metadata.py
@@ -34,9 +34,11 @@
from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema
from metadata.generated.schema.entity.data.storedProcedure import StoredProcedureCode
from metadata.generated.schema.entity.data.table import (
+ ConstraintType,
PartitionColumnDetails,
PartitionIntervalTypes,
Table,
+ TableConstraint,
TablePartition,
TableType,
)
@@ -96,6 +98,7 @@
from metadata.ingestion.source.database.multi_db_source import MultiDBSource
from metadata.utils import fqn
from metadata.utils.credentials import GOOGLE_CREDENTIALS
+from metadata.utils.execution_time_tracker import calculate_execution_time
from metadata.utils.filters import filter_by_database, filter_by_schema
from metadata.utils.helpers import retry_with_docker_host
from metadata.utils.logger import ingestion_logger
@@ -661,6 +664,42 @@ def _get_partition_column_name(
)
return None
+ @calculate_execution_time()
+ def update_table_constraints(
+ self,
+ table_name,
+ schema_name,
+ db_name,
+ table_constraints,
+ foreign_columns,
+ columns,
+ ) -> List[TableConstraint]:
+ """
+ From topology.
+ process the table constraints of all tables
+ """
+ table_constraints = super().update_table_constraints(
+ table_name,
+ schema_name,
+ db_name,
+ table_constraints,
+ foreign_columns,
+ columns,
+ )
+ try:
+ table = self.client.get_table(fqn._build(db_name, schema_name, table_name))
+ if hasattr(table, "clustering_fields") and table.clustering_fields:
+ table_constraints.append(
+ TableConstraint(
+ constraintType=ConstraintType.CLUSTER_KEY,
+ columns=table.clustering_fields,
+ )
+ )
+ except Exception as exc:
+ logger.warning(f"Error getting clustering fields for {table_name}: {exc}")
+ logger.debug(traceback.format_exc())
+ return table_constraints
+
def get_table_partition_details(
self, table_name: str, schema_name: str, inspector: Inspector
) -> Tuple[bool, Optional[TablePartition]]:
@@ -671,8 +710,10 @@ def get_table_partition_details(
database = self.context.get().database
table = self.client.get_table(fqn._build(database, schema_name, table_name))
columns = inspector.get_columns(table_name, schema_name, db_name=database)
- if hasattr(table, "external_data_configuration") and hasattr(
- table.external_data_configuration, "hive_partitioning"
+ if (
+ hasattr(table, "external_data_configuration")
+ and hasattr(table.external_data_configuration, "hive_partitioning")
+ and table.external_data_configuration.hive_partitioning
):
# Ingesting External Hive Partitioned Tables
from google.cloud.bigquery.external_config import ( # pylint: disable=import-outside-toplevel
@@ -739,6 +780,30 @@ def get_table_partition_details(
table_partition.interval = table.range_partitioning.range_.interval
table_partition.columnName = table.range_partitioning.field
return True, TablePartition(columns=[table_partition])
+ if (
+ hasattr(table, "_properties")
+ and table._properties.get("partitionDefinition")
+ and table._properties.get("partitionDefinition").get(
+ "partitionedColumn"
+ )
+ ):
+
+ return True, TablePartition(
+ columns=[
+ PartitionColumnDetails(
+ columnName=self._get_partition_column_name(
+ columns=columns,
+ partition_field_name=field.get("field"),
+ ),
+ intervalType=PartitionIntervalTypes.OTHER,
+ )
+ for field in table._properties.get("partitionDefinition").get(
+ "partitionedColumn"
+ )
+ if field and field.get("field")
+ ]
+ )
+
except Exception as exc:
logger.debug(traceback.format_exc())
logger.warning(
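The new update_table_constraints override surfaces BigQuery clustering columns as a CLUSTER_KEY table constraint by reading clustering_fields off the table metadata. A minimal sketch of the underlying client call it builds on (project, dataset, and table names are placeholders):

    from google.cloud import bigquery

    client = bigquery.Client()
    table = client.get_table("my-project.my_dataset.my_table")  # placeholder FQN

    if table.clustering_fields:
        # e.g. ['customer_id', 'order_date'] -> reported as a CLUSTER_KEY constraint
        print(table.clustering_fields)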
diff --git a/ingestion/src/metadata/ingestion/source/database/common_db_source.py b/ingestion/src/metadata/ingestion/source/database/common_db_source.py
index 1bc9ba40cb25..2ee49a4a9ccd 100644
--- a/ingestion/src/metadata/ingestion/source/database/common_db_source.py
+++ b/ingestion/src/metadata/ingestion/source/database/common_db_source.py
@@ -195,6 +195,12 @@ def get_schema_description(self, schema_name: str) -> Optional[str]:
by default there will be no schema description
"""
+ def get_stored_procedure_description(self, stored_procedure: str) -> Optional[str]:
+ """
+ Method to fetch the stored procedure description
+ by default there will be no stored procedure description
+ """
+
@calculate_execution_time_generator()
def yield_database(
self, database_name: str
diff --git a/ingestion/src/metadata/ingestion/source/database/databricks/connection.py b/ingestion/src/metadata/ingestion/source/database/databricks/connection.py
index 14d3d3e392e9..7bae5d4b7f8a 100644
--- a/ingestion/src/metadata/ingestion/source/database/databricks/connection.py
+++ b/ingestion/src/metadata/ingestion/source/database/databricks/connection.py
@@ -38,9 +38,9 @@
test_connection_steps,
)
from metadata.ingestion.ometa.ometa_api import OpenMetadata
-from metadata.ingestion.source.database.databricks.client import DatabricksClient
from metadata.ingestion.source.database.databricks.queries import (
DATABRICKS_GET_CATALOGS,
+ DATABRICKS_SQL_STATEMENT_TEST,
)
from metadata.utils.constants import THREE_MIN
from metadata.utils.logger import ingestion_logger
@@ -81,7 +81,6 @@ def test_connection(
Test connection. This can be executed either as part
of a metadata workflow or during an Automation Workflow
"""
- client = DatabricksClient(service_connection)
def test_database_query(engine: Engine, statement: str):
"""
@@ -106,7 +105,13 @@ def test_database_query(engine: Engine, statement: str):
engine=connection,
statement=DATABRICKS_GET_CATALOGS,
),
- "GetQueries": client.test_query_api_access,
+ "GetQueries": partial(
+ test_database_query,
+ engine=connection,
+ statement=DATABRICKS_SQL_STATEMENT_TEST.format(
+ query_history=service_connection.queryHistoryTable
+ ),
+ ),
}
return test_connection_steps(
diff --git a/ingestion/src/metadata/ingestion/source/database/databricks/lineage.py b/ingestion/src/metadata/ingestion/source/database/databricks/lineage.py
index a77cb780e56d..eb4b74b5d495 100644
--- a/ingestion/src/metadata/ingestion/source/database/databricks/lineage.py
+++ b/ingestion/src/metadata/ingestion/source/database/databricks/lineage.py
@@ -11,12 +11,10 @@
"""
Databricks lineage module
"""
-import traceback
-from datetime import datetime
-from typing import Iterator
-from metadata.generated.schema.type.basic import DateTime
-from metadata.generated.schema.type.tableQuery import TableQuery
+from metadata.ingestion.source.database.databricks.queries import (
+ DATABRICKS_SQL_STATEMENT,
+)
from metadata.ingestion.source.database.databricks.query_parser import (
DatabricksQueryParserSource,
)
@@ -31,23 +29,13 @@ class DatabricksLineageSource(DatabricksQueryParserSource, LineageSource):
Databricks Lineage Legacy Source
"""
- def yield_table_query(self) -> Iterator[TableQuery]:
- data = self.client.list_query_history(
- start_date=self.start,
- end_date=self.end,
+ sql_stmt = DATABRICKS_SQL_STATEMENT
+
+ filters = """
+ AND (
+ lower(statement_text) LIKE '%%create%%select%%'
+ OR lower(statement_text) LIKE '%%insert%%into%%select%%'
+ OR lower(statement_text) LIKE '%%update%%'
+ OR lower(statement_text) LIKE '%%merge%%'
)
- for row in data or []:
- try:
- if self.client.is_query_valid(row):
- yield TableQuery(
- dialect=self.dialect.value,
- query=row.get("query_text"),
- userName=row.get("user_name"),
- startTime=str(row.get("query_start_time_ms")),
- endTime=str(row.get("execution_end_time_ms")),
- analysisDate=DateTime(datetime.now()),
- serviceName=self.config.serviceName,
- )
- except Exception as exc:
- logger.debug(traceback.format_exc())
- logger.warning(f"Error processing query_dict {row}: {exc}")
+ """
diff --git a/ingestion/src/metadata/ingestion/source/database/databricks/queries.py b/ingestion/src/metadata/ingestion/source/database/databricks/queries.py
index 732dc79ad685..25cdcedfc105 100644
--- a/ingestion/src/metadata/ingestion/source/database/databricks/queries.py
+++ b/ingestion/src/metadata/ingestion/source/database/databricks/queries.py
@@ -14,6 +14,30 @@
import textwrap
+DATABRICKS_SQL_STATEMENT = textwrap.dedent(
+ """
+ SELECT
+ statement_type AS query_type,
+ statement_text AS query_text,
+ executed_by AS user_name,
+ start_time AS start_time,
+ null AS database_name,
+ null AS schema_name,
+ end_time AS end_time,
+ total_duration_ms/1000 AS duration
+ from {query_history}
+ WHERE statement_text NOT LIKE '/* {{"app": "OpenMetadata", %%}} */%%'
+ AND statement_text NOT LIKE '/* {{"app": "dbt", %%}} */%%'
+ AND start_time between to_timestamp('{start_time}') and to_timestamp('{end_time}')
+ {filters}
+ LIMIT {result_limit}
+ """
+)
+
+DATABRICKS_SQL_STATEMENT_TEST = """
+ SELECT statement_text from {query_history} LIMIT 1
+"""
+
DATABRICKS_VIEW_DEFINITIONS = textwrap.dedent(
"""
select
diff --git a/ingestion/src/metadata/ingestion/source/database/databricks/query_parser.py b/ingestion/src/metadata/ingestion/source/database/databricks/query_parser.py
index 00628bfbddaa..c67b06aa30ed 100644
--- a/ingestion/src/metadata/ingestion/source/database/databricks/query_parser.py
+++ b/ingestion/src/metadata/ingestion/source/database/databricks/query_parser.py
@@ -22,7 +22,6 @@
)
from metadata.ingestion.api.steps import InvalidSourceException
from metadata.ingestion.ometa.ometa_api import OpenMetadata
-from metadata.ingestion.source.database.databricks.client import DatabricksClient
from metadata.ingestion.source.database.query_parser_source import QueryParserSource
from metadata.utils.logger import ingestion_logger
@@ -36,18 +35,6 @@ class DatabricksQueryParserSource(QueryParserSource, ABC):
filters: str
- def _init_super(
- self,
- config: WorkflowSource,
- metadata: OpenMetadata,
- ):
- super().__init__(config, metadata, False)
-
- # pylint: disable=super-init-not-called
- def __init__(self, config: WorkflowSource, metadata: OpenMetadata):
- self._init_super(config=config, metadata=metadata)
- self.client = DatabricksClient(self.service_connection)
-
@classmethod
def create(
cls, config_dict, metadata: OpenMetadata, pipeline_name: Optional[str] = None
@@ -61,7 +48,16 @@ def create(
)
return cls(config, metadata)
- def prepare(self):
+ def get_sql_statement(self, start_time, end_time):
"""
- By default, there's nothing to prepare
+        Returns the SQL statement to fetch query logs.
+
+        Override if we need specific parameters.
"""
+ return self.sql_stmt.format(
+ start_time=start_time,
+ end_time=end_time,
+ filters=self.get_filters(),
+ result_limit=self.source_config.resultLimit,
+ query_history=self.service_connection.queryHistoryTable,
+ )
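The Databricks lineage and usage sources now rely on `get_sql_statement` to bind the placeholders of `DATABRICKS_SQL_STATEMENT`. Below is a trimmed-down sketch of that formatting step; the query-history table name, time window, filters and limit are hypothetical example values (in the real flow they come from `queryHistoryTable`, the source config and `get_filters()`).

```python
# Trimmed-down version of DATABRICKS_SQL_STATEMENT showing how the placeholders
# are bound; all concrete values below are made-up examples.
import textwrap

SQL_TEMPLATE = textwrap.dedent(
    """
    SELECT statement_text AS query_text, executed_by AS user_name
    FROM {query_history}
    WHERE start_time BETWEEN to_timestamp('{start_time}') AND to_timestamp('{end_time}')
    {filters}
    LIMIT {result_limit}
    """
)

statement = SQL_TEMPLATE.format(
    query_history="system.query.history",        # from queryHistoryTable (example)
    start_time="2024-01-01 00:00:00",
    end_time="2024-01-02 00:00:00",
    filters="AND statement_type NOT IN ('SHOW', 'DESCRIBE', 'USE')",
    result_limit=1000,
)
print(statement)
```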
diff --git a/ingestion/src/metadata/ingestion/source/database/databricks/usage.py b/ingestion/src/metadata/ingestion/source/database/databricks/usage.py
index 0e5364a465d8..fedbab2da486 100644
--- a/ingestion/src/metadata/ingestion/source/database/databricks/usage.py
+++ b/ingestion/src/metadata/ingestion/source/database/databricks/usage.py
@@ -11,12 +11,10 @@
"""
Databricks usage module
"""
-import traceback
-from datetime import datetime
-from typing import Iterable
-from metadata.generated.schema.type.basic import DateTime
-from metadata.generated.schema.type.tableQuery import TableQueries, TableQuery
+from metadata.ingestion.source.database.databricks.queries import (
+ DATABRICKS_SQL_STATEMENT,
+)
from metadata.ingestion.source.database.databricks.query_parser import (
DatabricksQueryParserSource,
)
@@ -31,36 +29,8 @@ class DatabricksUsageSource(DatabricksQueryParserSource, UsageSource):
Databricks Usage Source
"""
- def yield_table_queries(self) -> Iterable[TableQuery]:
- """
- Method to yield TableQueries
- """
- queries = []
- data = self.client.list_query_history(
- start_date=self.start,
- end_date=self.end,
- )
- for row in data or []:
- try:
- if self.client.is_query_valid(row):
- queries.append(
- TableQuery(
- dialect=self.dialect.value,
- query=row.get("query_text"),
- userName=row.get("user_name"),
- startTime=str(row.get("query_start_time_ms")),
- endTime=str(row.get("execution_end_time_ms")),
- analysisDate=DateTime(datetime.now()),
- serviceName=self.config.serviceName,
- duration=row.get("duration")
- if row.get("duration")
- else None,
- )
- )
- except Exception as err:
- logger.debug(traceback.format_exc())
- logger.warning(
- f"Failed to process query {row.get('query_text')} due to: {err}"
- )
+ sql_stmt = DATABRICKS_SQL_STATEMENT
- yield TableQueries(queries=queries)
+ filters = """
+ AND statement_type NOT IN ('SHOW', 'DESCRIBE', 'USE')
+ """
diff --git a/ingestion/src/metadata/ingestion/source/database/db2/connection.py b/ingestion/src/metadata/ingestion/source/database/db2/connection.py
index b25ac3efc757..c9efc348f730 100644
--- a/ingestion/src/metadata/ingestion/source/database/db2/connection.py
+++ b/ingestion/src/metadata/ingestion/source/database/db2/connection.py
@@ -50,7 +50,7 @@ def get_connection(connection: Db2Connection) -> Engine:
"w",
encoding=UTF_8,
) as file:
- file.write(connection.license)
+ file.write(connection.license.encode(UTF_8).decode("unicode-escape"))
return create_generic_db_connection(
connection=connection,
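The DB2 change decodes escape sequences in the stored license before writing it to disk. A small sketch of what that round-trip does, assuming the license text arrives with literal `\n` sequences rather than real newlines (the sample value is made up):

```python
# If the stored license contains literal backslash-escape sequences (a backslash
# followed by "n"), encode/decode("unicode-escape") turns them into real control
# characters before the file is written.
UTF_8 = "utf-8"

raw_license = "LICENSE LINE 1\\nLICENSE LINE 2"  # backslash + n, not a newline
decoded = raw_license.encode(UTF_8).decode("unicode-escape")

assert "\n" in decoded and "\\n" not in decoded
print(decoded)
```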
diff --git a/ingestion/src/metadata/ingestion/source/database/dbt/metadata.py b/ingestion/src/metadata/ingestion/source/database/dbt/metadata.py
index adb8c689278a..a422fd6e4902 100644
--- a/ingestion/src/metadata/ingestion/source/database/dbt/metadata.py
+++ b/ingestion/src/metadata/ingestion/source/database/dbt/metadata.py
@@ -1093,7 +1093,9 @@ def add_dbt_test_result(self, dbt_test: dict):
# Create the test case result object
test_case_result = TestCaseResult(
- timestamp=Timestamp(datetime_to_timestamp(dbt_timestamp)),
+ timestamp=Timestamp(
+ datetime_to_timestamp(dbt_timestamp, milliseconds=True)
+ ),
testCaseStatus=test_case_status,
testResultValue=[
TestResultValue(
diff --git a/ingestion/src/metadata/ingestion/source/database/mssql/metadata.py b/ingestion/src/metadata/ingestion/source/database/mssql/metadata.py
index f3f9a91f26e9..1639363f7d50 100644
--- a/ingestion/src/metadata/ingestion/source/database/mssql/metadata.py
+++ b/ingestion/src/metadata/ingestion/source/database/mssql/metadata.py
@@ -30,7 +30,7 @@
from metadata.generated.schema.metadataIngestion.workflow import (
Source as WorkflowSource,
)
-from metadata.generated.schema.type.basic import EntityName
+from metadata.generated.schema.type.basic import EntityName, Markdown
from metadata.ingestion.api.models import Either
from metadata.ingestion.api.steps import InvalidSourceException
from metadata.ingestion.ometa.ometa_api import OpenMetadata
@@ -41,6 +41,7 @@
)
from metadata.ingestion.source.database.mssql.queries import (
MSSQL_GET_DATABASE,
+ MSSQL_GET_STORED_PROCEDURE_COMMENTS,
MSSQL_GET_STORED_PROCEDURES,
)
from metadata.ingestion.source.database.mssql.utils import (
@@ -94,6 +95,14 @@ class MssqlSource(CommonDbSourceService, MultiDBSource):
Database metadata from MSSQL Source
"""
+ def __init__(
+ self,
+ config,
+ metadata,
+ ):
+ super().__init__(config, metadata)
+ self.stored_procedure_desc_map = {}
+
@classmethod
def create(
cls, config_dict, metadata: OpenMetadata, pipeline_name: Optional[str] = None
@@ -107,6 +116,27 @@ def create(
)
return cls(config, metadata)
+ def get_stored_procedure_description(self, stored_procedure: str) -> Optional[str]:
+ """
+ Method to fetch the stored procedure description
+ """
+ description = self.stored_procedure_desc_map.get(
+ (
+ self.context.get().database,
+ self.context.get().database_schema,
+ stored_procedure,
+ )
+ )
+ return Markdown(description) if description else None
+
+ def set_stored_procedure_description_map(self) -> None:
+ self.stored_procedure_desc_map.clear()
+ results = self.engine.execute(MSSQL_GET_STORED_PROCEDURE_COMMENTS).all()
+ self.stored_procedure_desc_map = {
+ (row.DATABASE_NAME, row.SCHEMA_NAME, row.STORED_PROCEDURE): row.COMMENT
+ for row in results
+ }
+
def get_configured_database(self) -> Optional[str]:
if not self.service_connection.ingestAllDatabases:
return self.service_connection.database
@@ -118,6 +148,7 @@ def get_database_names_raw(self) -> Iterable[str]:
def get_database_names(self) -> Iterable[str]:
if not self.config.serviceConnection.root.config.ingestAllDatabases:
configured_db = self.config.serviceConnection.root.config.database
+ self.set_stored_procedure_description_map()
self.set_inspector(database_name=configured_db)
yield configured_db
else:
@@ -178,7 +209,9 @@ def yield_stored_procedure(
try:
stored_procedure_request = CreateStoredProcedureRequest(
name=EntityName(stored_procedure.name),
- description=None,
+ description=self.get_stored_procedure_description(
+ stored_procedure.name
+ ),
storedProcedureCode=StoredProcedureCode(
language=STORED_PROC_LANGUAGE_MAP.get(stored_procedure.language),
code=stored_procedure.definition,
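Sketch of the lookup structure MSSQL now builds once per run: `set_stored_procedure_description_map` keys the `MS_Description` comments by `(database, schema, procedure)`, and `get_stored_procedure_description` probes that map for each procedure it yields. The rows below are hypothetical query results.

```python
# Hypothetical rows as returned by MSSQL_GET_STORED_PROCEDURE_COMMENTS.
from typing import Dict, Optional, Tuple

rows = [
    ("SalesDB", "dbo", "usp_refresh_orders", "Refreshes the orders rollup"),
    ("SalesDB", "dbo", "usp_cleanup", None),  # no MS_Description set
]

desc_map: Dict[Tuple[str, str, str], Optional[str]] = {
    (db, schema, proc): comment for db, schema, proc, comment in rows
}


def get_stored_procedure_description(db: str, schema: str, proc: str) -> Optional[str]:
    """Return the MS_Description extended property, if any, for the procedure."""
    return desc_map.get((db, schema, proc))


assert get_stored_procedure_description("SalesDB", "dbo", "usp_refresh_orders")
assert get_stored_procedure_description("SalesDB", "dbo", "usp_cleanup") is None
```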
diff --git a/ingestion/src/metadata/ingestion/source/database/mssql/queries.py b/ingestion/src/metadata/ingestion/source/database/mssql/queries.py
index 0ed3f06da060..c3f3c5b3a1d4 100644
--- a/ingestion/src/metadata/ingestion/source/database/mssql/queries.py
+++ b/ingestion/src/metadata/ingestion/source/database/mssql/queries.py
@@ -56,6 +56,23 @@
"""
)
+MSSQL_GET_STORED_PROCEDURE_COMMENTS = textwrap.dedent(
+ """
+SELECT
+ DB_NAME() AS DATABASE_NAME,
+ s.name AS SCHEMA_NAME,
+ p.name AS STORED_PROCEDURE,
+ ep.value AS COMMENT
+FROM sys.procedures p
+JOIN sys.schemas s ON p.schema_id = s.schema_id
+LEFT JOIN sys.extended_properties ep
+ ON ep.major_id = p.object_id
+ AND ep.minor_id = 0
+ AND ep.class = 1
+ AND ep.name = 'MS_Description';
+"""
+)
+
MSSQL_ALL_VIEW_DEFINITIONS = textwrap.dedent(
"""
SELECT
diff --git a/ingestion/src/metadata/ingestion/source/database/postgres/usage.py b/ingestion/src/metadata/ingestion/source/database/postgres/usage.py
index 579590b4ec59..522a89a9e2ee 100644
--- a/ingestion/src/metadata/ingestion/source/database/postgres/usage.py
+++ b/ingestion/src/metadata/ingestion/source/database/postgres/usage.py
@@ -15,6 +15,11 @@
from datetime import datetime
from typing import Iterable
+from sqlalchemy.exc import OperationalError
+
+from metadata.generated.schema.entity.services.ingestionPipelines.status import (
+ StackTraceError,
+)
from metadata.generated.schema.type.basic import DateTime
from metadata.generated.schema.type.tableQuery import TableQueries, TableQuery
from metadata.ingestion.source.connections import get_connection
@@ -67,6 +72,16 @@ def process_table_query(self) -> Iterable[TableQueries]:
logger.error(str(err))
if queries:
yield TableQueries(queries=queries)
+
+ except OperationalError as err:
+ self.status.failed(
+ StackTraceError(
+ name="Usage",
+ error=f"Source Usage failed due to - {err}",
+ stackTrace=traceback.format_exc(),
+ )
+ )
+
except Exception as err:
if query:
logger.debug(
diff --git a/ingestion/src/metadata/ingestion/source/database/redshift/metadata.py b/ingestion/src/metadata/ingestion/source/database/redshift/metadata.py
index d1c87bd07289..244d96acce2a 100644
--- a/ingestion/src/metadata/ingestion/source/database/redshift/metadata.py
+++ b/ingestion/src/metadata/ingestion/source/database/redshift/metadata.py
@@ -11,7 +11,6 @@
"""
Redshift source ingestion
"""
-
import re
import traceback
from typing import Iterable, List, Optional, Tuple
@@ -57,6 +56,9 @@
CommonDbSourceService,
TableNameAndType,
)
+from metadata.ingestion.source.database.external_table_lineage_mixin import (
+ ExternalTableLineageMixin,
+)
from metadata.ingestion.source.database.incremental_metadata_extraction import (
IncrementalConfig,
)
@@ -69,6 +71,7 @@
)
from metadata.ingestion.source.database.redshift.models import RedshiftStoredProcedure
from metadata.ingestion.source.database.redshift.queries import (
+ REDSHIFT_EXTERNAL_TABLE_LOCATION,
REDSHIFT_GET_ALL_RELATION_INFO,
REDSHIFT_GET_DATABASE_NAMES,
REDSHIFT_GET_STORED_PROCEDURES,
@@ -121,12 +124,13 @@
RedshiftDialect._get_all_relation_info = ( # pylint: disable=protected-access
_get_all_relation_info
)
-
Inspector.get_all_table_ddls = get_all_table_ddls
Inspector.get_table_ddl = get_table_ddl
-class RedshiftSource(LifeCycleQueryMixin, CommonDbSourceService, MultiDBSource):
+class RedshiftSource(
+ ExternalTableLineageMixin, LifeCycleQueryMixin, CommonDbSourceService, MultiDBSource
+):
"""
Implements the necessary methods to extract
Database metadata from Redshift Source
@@ -146,6 +150,7 @@ def __init__(
self.incremental_table_processor: Optional[
RedshiftIncrementalTableProcessor
] = None
+ self.external_location_map = {}
if self.incremental.enabled:
logger.info(
@@ -168,6 +173,14 @@ def create(
)
return cls(config, metadata, incremental_config)
+ def get_location_path(self, table_name: str, schema_name: str) -> Optional[str]:
+ """
+ Method to fetch the location path of the table
+ """
+ return self.external_location_map.get(
+ (self.context.get().database, schema_name, table_name)
+ )
+
def get_partition_details(self) -> None:
"""
Populate partition details
@@ -275,15 +288,23 @@ def _set_incremental_table_processor(self, database: str):
for schema_name, table_name in self.incremental_table_processor.get_deleted()
)
+ def set_external_location_map(self, database_name: str) -> None:
+ self.external_location_map.clear()
+ results = self.engine.execute(
+ REDSHIFT_EXTERNAL_TABLE_LOCATION.format(database_name=database_name)
+ ).all()
+ self.external_location_map = {
+ (database_name, row.schemaname, row.tablename): row.location
+ for row in results
+ }
+
def get_database_names(self) -> Iterable[str]:
if not self.config.serviceConnection.root.config.ingestAllDatabases:
+ configured_db = self.config.serviceConnection.root.config.database
self.get_partition_details()
-
- self._set_incremental_table_processor(
- self.config.serviceConnection.root.config.database
- )
-
- yield self.config.serviceConnection.root.config.database
+ self._set_incremental_table_processor(configured_db)
+ self.set_external_location_map(configured_db)
+ yield configured_db
else:
for new_database in self.get_database_names_raw():
database_fqn = fqn.build(
@@ -307,9 +328,8 @@ def get_database_names(self) -> Iterable[str]:
try:
self.set_inspector(database_name=new_database)
self.get_partition_details()
-
self._set_incremental_table_processor(new_database)
-
+ self.set_external_location_map(new_database)
yield new_database
except Exception as exc:
logger.debug(traceback.format_exc())
diff --git a/ingestion/src/metadata/ingestion/source/database/redshift/queries.py b/ingestion/src/metadata/ingestion/source/database/redshift/queries.py
index bc840ff1b00d..114ba4e33dd5 100644
--- a/ingestion/src/metadata/ingestion/source/database/redshift/queries.py
+++ b/ingestion/src/metadata/ingestion/source/database/redshift/queries.py
@@ -203,6 +203,11 @@
"""
)
+REDSHIFT_EXTERNAL_TABLE_LOCATION = """
+ SELECT schemaname, tablename, location
+ FROM svv_external_tables
+ where redshift_database_name='{database_name}'
+"""
REDSHIFT_PARTITION_DETAILS = """
select "schema", "table", diststyle
diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/connection.py b/ingestion/src/metadata/ingestion/source/database/snowflake/connection.py
index e32479fe3987..6c98a201b319 100644
--- a/ingestion/src/metadata/ingestion/source/database/snowflake/connection.py
+++ b/ingestion/src/metadata/ingestion/source/database/snowflake/connection.py
@@ -193,10 +193,18 @@ def test_connection(
engine_wrapper=engine_wrapper,
),
"GetQueries": partial(
- test_query, statement=SNOWFLAKE_TEST_GET_QUERIES, engine=engine
+ test_query,
+ statement=SNOWFLAKE_TEST_GET_QUERIES.format(
+ account_usage=service_connection.accountUsageSchema
+ ),
+ engine=engine,
),
"GetTags": partial(
- test_query, statement=SNOWFLAKE_TEST_FETCH_TAG, engine=engine
+ test_query,
+ statement=SNOWFLAKE_TEST_FETCH_TAG.format(
+ account_usage=service_connection.accountUsageSchema
+ ),
+ engine=engine,
),
}
diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/lineage.py b/ingestion/src/metadata/ingestion/source/database/snowflake/lineage.py
index 3ef0232b790d..1337635fe14f 100644
--- a/ingestion/src/metadata/ingestion/source/database/snowflake/lineage.py
+++ b/ingestion/src/metadata/ingestion/source/database/snowflake/lineage.py
@@ -62,6 +62,7 @@ def get_stored_procedure_queries_dict(self) -> Dict[str, List[QueryByProcedure]]
start, _ = get_start_and_end(self.source_config.queryLogDuration)
query = self.stored_procedure_query.format(
start_date=start,
+ account_usage=self.service_connection.accountUsageSchema,
)
queries_dict = self.procedure_queries_dict(
query=query,
diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/metadata.py b/ingestion/src/metadata/ingestion/source/database/snowflake/metadata.py
index da5a7034a26d..f6f026d39137 100644
--- a/ingestion/src/metadata/ingestion/source/database/snowflake/metadata.py
+++ b/ingestion/src/metadata/ingestion/source/database/snowflake/metadata.py
@@ -418,6 +418,7 @@ def yield_tag(
SNOWFLAKE_FETCH_ALL_TAGS.format(
database_name=self.context.get().database,
schema_name=schema_name,
+ account_usage=self.service_connection.accountUsageSchema,
)
)
@@ -431,6 +432,7 @@ def yield_tag(
SNOWFLAKE_FETCH_ALL_TAGS.format(
database_name=f'"{self.context.get().database}"',
schema_name=f'"{self.context.get().database_schema}"',
+ account_usage=self.service_connection.accountUsageSchema,
)
)
except Exception as inner_exc:
@@ -635,6 +637,7 @@ def _get_stored_procedures_internal(
query.format(
database_name=self.context.get().database,
schema_name=self.context.get().database_schema,
+ account_usage=self.service_connection.accountUsageSchema,
)
).all()
for row in results:
diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/queries.py b/ingestion/src/metadata/ingestion/source/database/snowflake/queries.py
index a0a4b819c0c1..55b2bf909c4a 100644
--- a/ingestion/src/metadata/ingestion/source/database/snowflake/queries.py
+++ b/ingestion/src/metadata/ingestion/source/database/snowflake/queries.py
@@ -25,7 +25,7 @@
start_time "start_time",
end_time "end_time",
total_elapsed_time "duration"
- from snowflake.account_usage.query_history
+ from {account_usage}.query_history
WHERE query_text NOT LIKE '/* {{"app": "OpenMetadata", %%}} */%%'
AND query_text NOT LIKE '/* {{"app": "dbt", %%}} */%%'
AND start_time between to_timestamp_ltz('{start_time}') and to_timestamp_ltz('{end_time}')
@@ -39,7 +39,7 @@
SNOWFLAKE_FETCH_ALL_TAGS = textwrap.dedent(
"""
select TAG_NAME, TAG_VALUE, OBJECT_DATABASE, OBJECT_SCHEMA, OBJECT_NAME, COLUMN_NAME
- from snowflake.account_usage.tag_references
+ from {account_usage}.tag_references
where OBJECT_DATABASE = '{database_name}'
and OBJECT_SCHEMA = '{schema_name}'
"""
@@ -234,11 +234,11 @@
"""
SNOWFLAKE_TEST_FETCH_TAG = """
-select TAG_NAME from snowflake.account_usage.tag_references limit 1
+select TAG_NAME from {account_usage}.tag_references limit 1
"""
SNOWFLAKE_TEST_GET_QUERIES = """
-SELECT query_text from snowflake.account_usage.query_history limit 1
+SELECT query_text from {account_usage}.query_history limit 1
"""
SNOWFLAKE_TEST_GET_TABLES = """
@@ -296,10 +296,10 @@
ARGUMENT_SIGNATURE AS signature,
COMMENT as comment,
'StoredProcedure' as procedure_type
-FROM SNOWFLAKE.ACCOUNT_USAGE.PROCEDURES
+FROM {account_usage}.PROCEDURES
WHERE PROCEDURE_CATALOG = '{database_name}'
AND PROCEDURE_SCHEMA = '{schema_name}'
- AND DELETED IS NOT NULL
+ AND DELETED IS NULL
"""
)
@@ -313,10 +313,10 @@
ARGUMENT_SIGNATURE AS signature,
COMMENT as comment,
'UDF' as procedure_type
-FROM SNOWFLAKE.ACCOUNT_USAGE.FUNCTIONS
+FROM {account_usage}.FUNCTIONS
WHERE FUNCTION_CATALOG = '{database_name}'
AND FUNCTION_SCHEMA = '{schema_name}'
- AND DELETED IS NOT NULL
+ AND DELETED IS NULL
"""
)
@@ -336,7 +336,7 @@
SESSION_ID,
START_TIME,
END_TIME
- FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY SP
+ FROM {account_usage}.QUERY_HISTORY SP
WHERE QUERY_TYPE = 'CALL'
AND START_TIME >= '{start_date}'
AND QUERY_TEXT <> ''
@@ -353,7 +353,7 @@
USER_NAME,
SCHEMA_NAME,
DATABASE_NAME
- FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY SP
+ FROM {account_usage}.QUERY_HISTORY SP
WHERE QUERY_TYPE <> 'CALL'
AND QUERY_TEXT NOT LIKE '/* {{"app": "OpenMetadata", %%}} */%%'
AND QUERY_TEXT NOT LIKE '/* {{"app": "dbt", %%}} */%%'
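With the `{account_usage}` placeholder, the same Snowflake templates can point either at the default `SNOWFLAKE.ACCOUNT_USAGE` share or at whatever `accountUsageSchema` is configured on the connection. A minimal sketch; the replicated schema name is a made-up example.

```python
# One template, two possible account-usage schemas; the second value is hypothetical.
SNOWFLAKE_TEST_GET_QUERIES = """
SELECT query_text from {account_usage}.query_history limit 1
"""

for account_usage in ("SNOWFLAKE.ACCOUNT_USAGE", "MY_DB.ACCOUNT_USAGE_COPY"):
    print(SNOWFLAKE_TEST_GET_QUERIES.format(account_usage=account_usage))
```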
diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/query_parser.py b/ingestion/src/metadata/ingestion/source/database/snowflake/query_parser.py
index bbc528fc4c44..363495bd0f87 100644
--- a/ingestion/src/metadata/ingestion/source/database/snowflake/query_parser.py
+++ b/ingestion/src/metadata/ingestion/source/database/snowflake/query_parser.py
@@ -60,6 +60,7 @@ def get_sql_statement(self, start_time: datetime, end_time: datetime) -> str:
end_time=end_time,
result_limit=self.config.sourceConfig.config.resultLimit,
filters=self.get_filters(),
+ account_usage=self.service_connection.accountUsageSchema,
)
def check_life_cycle_query(
diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/utils.py b/ingestion/src/metadata/ingestion/source/database/snowflake/utils.py
index 4f2e7f007f61..a1660f13758f 100644
--- a/ingestion/src/metadata/ingestion/source/database/snowflake/utils.py
+++ b/ingestion/src/metadata/ingestion/source/database/snowflake/utils.py
@@ -12,10 +12,12 @@
"""
Module to define overridden dialect methods
"""
-
+import operator
+from functools import reduce
from typing import Dict, Optional
import sqlalchemy.types as sqltypes
+from snowflake.sqlalchemy.snowdialect import SnowflakeDialect
from sqlalchemy import exc as sa_exc
from sqlalchemy import util as sa_util
from sqlalchemy.engine import reflection
@@ -52,6 +54,7 @@
get_table_comment_wrapper,
)
+dialect = SnowflakeDialect()
Query = str
QueryMap = Dict[str, Query]
@@ -83,6 +86,20 @@
}
+def _denormalize_quote_join(*idents):
+ ip = dialect.identifier_preparer
+ split_idents = reduce(
+ operator.add,
+ [ip._split_schema_by_dot(ids) for ids in idents if ids is not None],
+ )
+ quoted_identifiers = ip._quote_free_identifiers(*split_idents)
+ normalized_identifiers = (
+ item if item.startswith('"') and item.endswith('"') else f'"{item}"'
+ for item in quoted_identifiers
+ )
+ return ".".join(normalized_identifiers)
+
+
def _quoted_name(entity_name: Optional[str]) -> Optional[str]:
if entity_name:
return fqn.quote_name(entity_name)
@@ -256,17 +273,16 @@ def get_schema_columns(self, connection, schema, **kw):
None, as it is cacheable and is an unexpected return type for this function"""
ans = {}
current_database, _ = self._current_database_schema(connection, **kw)
- full_schema_name = self._denormalize_quote_join(
- current_database, fqn.quote_name(schema)
- )
+ full_schema_name = _denormalize_quote_join(current_database, fqn.quote_name(schema))
try:
schema_primary_keys = self._get_schema_primary_keys(
connection, full_schema_name, **kw
)
+            # removing quotes (") from the schema name because it is used in the WHERE clause of a query
+ table_schema = self.denormalize_name(fqn.unquote_name(schema))
+ table_schema = table_schema.lower() if schema.islower() else table_schema
result = connection.execute(
- text(SNOWFLAKE_GET_SCHEMA_COLUMNS),
- {"table_schema": self.denormalize_name(fqn.unquote_name(schema))}
- # removing " " from schema name because schema name is in the WHERE clause of a query
+ text(SNOWFLAKE_GET_SCHEMA_COLUMNS), {"table_schema": table_schema}
)
except sa_exc.ProgrammingError as p_err:
@@ -362,9 +378,10 @@ def get_pk_constraint(self, connection, table_name, schema=None, **kw):
schema = schema or self.default_schema_name
schema = _quoted_name(entity_name=schema)
current_database, current_schema = self._current_database_schema(connection, **kw)
- full_schema_name = self._denormalize_quote_join(
+ full_schema_name = _denormalize_quote_join(
current_database, schema if schema else current_schema
)
+
return self._get_schema_primary_keys(
connection, self.denormalize_name(full_schema_name), **kw
).get(table_name, {"constrained_columns": [], "name": None})
@@ -378,7 +395,7 @@ def get_foreign_keys(self, connection, table_name, schema=None, **kw):
schema = schema or self.default_schema_name
schema = _quoted_name(entity_name=schema)
current_database, current_schema = self._current_database_schema(connection, **kw)
- full_schema_name = self._denormalize_quote_join(
+ full_schema_name = _denormalize_quote_join(
current_database, schema if schema else current_schema
)
@@ -452,9 +469,10 @@ def get_unique_constraints(self, connection, table_name, schema, **kw):
schema = schema or self.default_schema_name
schema = _quoted_name(entity_name=schema)
current_database, current_schema = self._current_database_schema(connection, **kw)
- full_schema_name = self._denormalize_quote_join(
+ full_schema_name = _denormalize_quote_join(
current_database, schema if schema else current_schema
)
+
return self._get_schema_unique_constraints(
connection, self.denormalize_name(full_schema_name), **kw
).get(table_name, [])
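A dependency-free approximation of the quoting rule `_denormalize_quote_join` applies: every identifier part ends up wrapped in double quotes exactly once before being re-joined with dots. The real helper delegates splitting and quoting to snowflake-sqlalchemy's identifier preparer; this sketch only mirrors the final normalization step and is not the library API.

```python
# Simplified stand-in for the quoting/normalization behaviour, for illustration only.
from typing import Optional


def denormalize_quote_join(*idents: Optional[str]) -> str:
    parts = []
    for ident in idents:
        if ident is None:
            continue
        parts.extend(ident.split("."))
    return ".".join(
        part if part.startswith('"') and part.endswith('"') else f'"{part}"'
        for part in parts
    )


assert denormalize_quote_join("MY_DB", "my schema") == '"MY_DB"."my schema"'
assert denormalize_quote_join('"MY_DB"', None, "PUBLIC") == '"MY_DB"."PUBLIC"'
```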
diff --git a/ingestion/src/metadata/ingestion/source/database/trino/connection.py b/ingestion/src/metadata/ingestion/source/database/trino/connection.py
index 6dbae4ac9c39..a87dd792b6bd 100644
--- a/ingestion/src/metadata/ingestion/source/database/trino/connection.py
+++ b/ingestion/src/metadata/ingestion/source/database/trino/connection.py
@@ -39,6 +39,7 @@
create_generic_db_connection,
get_connection_args_common,
init_empty_connection_arguments,
+ init_empty_connection_options,
)
from metadata.ingestion.connections.secrets import connection_with_options_secrets
from metadata.ingestion.connections.test_connections import (
@@ -135,6 +136,10 @@ def get_connection(connection: TrinoConnection) -> Engine:
# here we are creating a copy of connection, because we need to dynamically
# add auth params to connectionArguments, which we do not intend to store
# in original connection object and in OpenMetadata database
+ from trino.sqlalchemy.dialect import TrinoDialect
+
+ TrinoDialect.is_disconnect = _is_disconnect
+
connection_copy = deepcopy(connection)
if connection_copy.verify:
connection_copy.connectionArguments = (
@@ -183,3 +188,11 @@ def test_connection(
queries=queries,
timeout_seconds=timeout_seconds,
)
+
+
+# pylint: disable=unused-argument
+def _is_disconnect(self, e, connection, cursor):
+    """is_disconnect method for the Trino dialect"""
+ if "JWT expired" in str(e):
+ return True
+ return False
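The Trino monkey-patch above tells SQLAlchemy to treat expired-JWT errors as disconnects, so the pool invalidates the stale connection and reconnects. A condensed sketch of the predicate (the original spells it out with an if/return):

```python
# Any error whose text contains "JWT expired" is reported as a disconnect.
def _is_disconnect(self, e, connection, cursor):  # pylint: disable=unused-argument
    return "JWT expired" in str(e)


assert _is_disconnect(None, Exception("JWT expired: token issued at ..."), None, None)
assert not _is_disconnect(None, Exception("syntax error"), None, None)
```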
diff --git a/ingestion/src/metadata/ingestion/source/database/unitycatalog/lineage.py b/ingestion/src/metadata/ingestion/source/database/unitycatalog/lineage.py
index 8f36b033fc10..92813459af5d 100644
--- a/ingestion/src/metadata/ingestion/source/database/unitycatalog/lineage.py
+++ b/ingestion/src/metadata/ingestion/source/database/unitycatalog/lineage.py
@@ -11,6 +11,7 @@
"""
Databricks Unity Catalog Lineage Source Module
"""
+import traceback
from typing import Iterable, Optional
from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest
@@ -27,6 +28,7 @@
EntitiesEdge,
LineageDetails,
)
+from metadata.generated.schema.type.entityLineage import Source as LineageSource
from metadata.generated.schema.type.entityReference import EntityReference
from metadata.ingestion.api.models import Either
from metadata.ingestion.api.steps import InvalidSourceException, Source
@@ -111,9 +113,59 @@ def _get_lineage_details(
)
)
if col_lineage:
- return LineageDetails(columnsLineage=col_lineage)
+ return LineageDetails(
+ columnsLineage=col_lineage, source=LineageSource.QueryLineage
+ )
return None
+ def _handle_upstream_table(
+ self,
+ table_streams: LineageTableStreams,
+ table: Table,
+ databricks_table_fqn: str,
+ ) -> Iterable[Either[AddLineageRequest]]:
+ for upstream_table in table_streams.upstream_tables:
+ try:
+ if not upstream_table.name:
+ continue
+ from_entity_fqn = fqn.build(
+ metadata=self.metadata,
+ entity_type=Table,
+ database_name=upstream_table.catalog_name,
+ schema_name=upstream_table.schema_name,
+ table_name=upstream_table.name,
+ service_name=self.config.serviceName,
+ )
+
+ from_entity = self.metadata.get_by_name(
+ entity=Table, fqn=from_entity_fqn
+ )
+ if from_entity:
+ lineage_details = self._get_lineage_details(
+ from_table=from_entity,
+ to_table=table,
+ databricks_table_fqn=databricks_table_fqn,
+ )
+ yield Either(
+ left=None,
+ right=AddLineageRequest(
+ edge=EntitiesEdge(
+ toEntity=EntityReference(id=table.id, type="table"),
+ fromEntity=EntityReference(
+ id=from_entity.id, type="table"
+ ),
+ lineageDetails=lineage_details,
+ )
+ ),
+ )
+ except Exception:
+ logger.debug(
+ "Error while processing lineage for "
+ f"{upstream_table.catalog_name}.{upstream_table.schema_name}.{upstream_table.name}"
+ f" -> {databricks_table_fqn}"
+ )
+ logger.debug(traceback.format_exc())
+
def _iter(self, *_, **__) -> Iterable[Either[AddLineageRequest]]:
"""
Based on the query logs, prepare the lineage
@@ -130,37 +182,9 @@ def _iter(self, *_, **__) -> Iterable[Either[AddLineageRequest]]:
table_streams: LineageTableStreams = self.client.get_table_lineage(
databricks_table_fqn
)
- for upstream_table in table_streams.upstream_tables:
- from_entity_fqn = fqn.build(
- metadata=self.metadata,
- entity_type=Table,
- database_name=upstream_table.catalog_name,
- schema_name=upstream_table.schema_name,
- table_name=upstream_table.name,
- service_name=self.config.serviceName,
- )
-
- from_entity = self.metadata.get_by_name(
- entity=Table, fqn=from_entity_fqn
- )
- if from_entity:
- lineage_details = self._get_lineage_details(
- from_table=from_entity,
- to_table=table,
- databricks_table_fqn=databricks_table_fqn,
- )
- yield Either(
- left=None,
- right=AddLineageRequest(
- edge=EntitiesEdge(
- toEntity=EntityReference(id=table.id, type="table"),
- fromEntity=EntityReference(
- id=from_entity.id, type="table"
- ),
- lineageDetails=lineage_details,
- )
- ),
- )
+ yield from self._handle_upstream_table(
+ table_streams, table, databricks_table_fqn
+ )
def test_connection(self) -> None:
test_connection_fn = get_test_connection_fn(self.service_connection)
diff --git a/ingestion/src/metadata/ingestion/source/database/unitycatalog/query_parser.py b/ingestion/src/metadata/ingestion/source/database/unitycatalog/query_parser.py
index 5a6b7933a28d..f2ac03b99f98 100644
--- a/ingestion/src/metadata/ingestion/source/database/unitycatalog/query_parser.py
+++ b/ingestion/src/metadata/ingestion/source/database/unitycatalog/query_parser.py
@@ -44,6 +44,13 @@ class UnityCatalogQueryParserSource(
filters: str
+ def _init_super(
+ self,
+ config: WorkflowSource,
+ metadata: OpenMetadata,
+ ):
+ super().__init__(config, metadata, False)
+
# pylint: disable=super-init-not-called
def __init__(self, config: WorkflowSource, metadata: OpenMetadata):
self._init_super(config=config, metadata=metadata)
diff --git a/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py b/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py
index ca9f17b254e5..892f88ef7b74 100644
--- a/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py
+++ b/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py
@@ -11,6 +11,9 @@
from metadata.profiler.interface.sqlalchemy.unity_catalog.profiler_interface import (
UnityCatalogProfilerInterface,
)
+from metadata.profiler.interface.sqlalchemy.unity_catalog.sampler_interface import (
+ UnityCatalogSamplerInterface,
+)
from metadata.utils.service_spec.default import DefaultDatabaseSpec
ServiceSpec = DefaultDatabaseSpec(
@@ -19,4 +22,5 @@
usage_source_class=UnitycatalogUsageSource,
profiler_class=UnityCatalogProfilerInterface,
test_suite_class=UnityCatalogTestSuiteInterface,
+ sampler_class=UnityCatalogSamplerInterface,
)
diff --git a/ingestion/src/metadata/ingestion/source/database/unitycatalog/usage.py b/ingestion/src/metadata/ingestion/source/database/unitycatalog/usage.py
index 7a310f736a3f..c533454be8cb 100644
--- a/ingestion/src/metadata/ingestion/source/database/unitycatalog/usage.py
+++ b/ingestion/src/metadata/ingestion/source/database/unitycatalog/usage.py
@@ -11,17 +11,22 @@
"""
unity catalog usage module
"""
+import traceback
+from datetime import datetime
+from typing import Iterable
-from metadata.ingestion.source.database.databricks.usage import DatabricksUsageSource
+from metadata.generated.schema.type.basic import DateTime
+from metadata.generated.schema.type.tableQuery import TableQueries, TableQuery
from metadata.ingestion.source.database.unitycatalog.query_parser import (
UnityCatalogQueryParserSource,
)
+from metadata.ingestion.source.database.usage_source import UsageSource
from metadata.utils.logger import ingestion_logger
logger = ingestion_logger()
-class UnitycatalogUsageSource(UnityCatalogQueryParserSource, DatabricksUsageSource):
+class UnitycatalogUsageSource(UnityCatalogQueryParserSource, UsageSource):
"""
UnityCatalog Usage Source
@@ -29,3 +34,37 @@ class UnitycatalogUsageSource(UnityCatalogQueryParserSource, DatabricksUsageSour
DatabricksUsageSource as both the sources would call
the same API for fetching Usage Queries
"""
+
+ def yield_table_queries(self) -> Iterable[TableQuery]:
+ """
+ Method to yield TableQueries
+ """
+ queries = []
+ data = self.client.list_query_history(
+ start_date=self.start,
+ end_date=self.end,
+ )
+ for row in data or []:
+ try:
+ if self.client.is_query_valid(row):
+ queries.append(
+ TableQuery(
+ dialect=self.dialect.value,
+ query=row.get("query_text"),
+ userName=row.get("user_name"),
+ startTime=str(row.get("query_start_time_ms")),
+ endTime=str(row.get("execution_end_time_ms")),
+ analysisDate=DateTime(datetime.now()),
+ serviceName=self.config.serviceName,
+ duration=row.get("duration")
+ if row.get("duration")
+ else None,
+ )
+ )
+ except Exception as err:
+ logger.debug(traceback.format_exc())
+ logger.warning(
+ f"Failed to process query {row.get('query_text')} due to: {err}"
+ )
+
+ yield TableQueries(queries=queries)
diff --git a/ingestion/src/metadata/ingestion/source/database/usage_source.py b/ingestion/src/metadata/ingestion/source/database/usage_source.py
index 6209d601364f..65d5f2635876 100644
--- a/ingestion/src/metadata/ingestion/source/database/usage_source.py
+++ b/ingestion/src/metadata/ingestion/source/database/usage_source.py
@@ -153,7 +153,7 @@ def yield_table_queries(self) -> Iterable[TableQuery]:
if query:
logger.debug(
(
- f"###### USAGE QUERY #######\n{mask_query(query, self.dialect.value)}"
+ f"###### USAGE QUERY #######\n{mask_query(query, self.dialect.value) or query}"
"\n##########################"
)
)
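`mask_query` can return `None` when a query cannot be parsed; the `or query` fallback above makes sure the raw statement is logged instead of the literal string `None`. A sketch of the pattern, with a stand-in masker rather than the real `mask_query` helper:

```python
# Stand-in masker that mimics the "None on parse failure" behaviour.
from typing import Optional


def mask_query(query: str, dialect: str) -> Optional[str]:
    if query.startswith("--unparseable"):
        return None
    return query.replace("'secret'", "'?'")


query = "--unparseable SELECT 1"
print(mask_query(query, "postgres") or query)  # falls back to the raw query
```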
diff --git a/ingestion/src/metadata/ingestion/source/pipeline/nifi/connection.py b/ingestion/src/metadata/ingestion/source/pipeline/nifi/connection.py
index 097b8cc1dbb4..680a8cff54c5 100644
--- a/ingestion/src/metadata/ingestion/source/pipeline/nifi/connection.py
+++ b/ingestion/src/metadata/ingestion/source/pipeline/nifi/connection.py
@@ -17,8 +17,10 @@
from metadata.generated.schema.entity.automations.workflow import (
Workflow as AutomationWorkflow,
)
+from metadata.generated.schema.entity.services.connections.pipeline.nifi.basicAuth import (
+ NifiBasicAuth,
+)
from metadata.generated.schema.entity.services.connections.pipeline.nifiConnection import (
- BasicAuthentication,
NifiConnection,
)
from metadata.generated.schema.entity.services.connections.testConnectionResult import (
@@ -34,7 +36,7 @@ def get_connection(connection: NifiConnection) -> NifiClient:
"""
Create connection
"""
- if isinstance(connection.nifiConfig, BasicAuthentication):
+ if isinstance(connection.nifiConfig, NifiBasicAuth):
return NifiClient(
host_port=connection.hostPort,
username=connection.nifiConfig.username,
diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/unity_catalog/sampler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/unity_catalog/sampler_interface.py
new file mode 100644
index 000000000000..12a4ae3eaacb
--- /dev/null
+++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/unity_catalog/sampler_interface.py
@@ -0,0 +1,29 @@
+# Copyright 2021 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Interfaces with the database for all database engines
+supporting the sqlalchemy abstraction layer
+"""
+from metadata.ingestion.source.database.databricks.connection import (
+ get_connection as databricks_get_connection,
+)
+from metadata.sampler.sqlalchemy.sampler import SQASampler
+
+
+class UnityCatalogSamplerInterface(SQASampler):
+ def get_client(self):
+ """client is the session for SQA"""
+ self.connection = databricks_get_connection(self.service_connection_config)
+ self.client = super().get_client()
+ self.set_catalog(self.client)
+
+ return self.client
diff --git a/ingestion/src/metadata/profiler/source/fetcher/config.py b/ingestion/src/metadata/profiler/source/fetcher/config.py
index ef44e9eca1e2..006f1d5181c3 100644
--- a/ingestion/src/metadata/profiler/source/fetcher/config.py
+++ b/ingestion/src/metadata/profiler/source/fetcher/config.py
@@ -42,3 +42,7 @@ def tableFilterPattern(self) -> Optional[FilterPattern]:
@property
def useFqnForFiltering(self) -> Optional[bool]:
...
+
+ @property
+ def includeViews(self) -> Optional[bool]:
+ ...
diff --git a/ingestion/src/metadata/profiler/source/fetcher/fetcher_strategy.py b/ingestion/src/metadata/profiler/source/fetcher/fetcher_strategy.py
index 0fd55f84ccdc..60c04e3a8b81 100644
--- a/ingestion/src/metadata/profiler/source/fetcher/fetcher_strategy.py
+++ b/ingestion/src/metadata/profiler/source/fetcher/fetcher_strategy.py
@@ -17,6 +17,7 @@
from typing import Iterable, Iterator, Optional, cast
from metadata.generated.schema.entity.data.database import Database
+from metadata.generated.schema.entity.data.table import TableType
from metadata.generated.schema.entity.services.ingestionPipelines.status import (
StackTraceError,
)
@@ -115,13 +116,13 @@ def __init__(
super().__init__(config, metadata, global_profiler_config, status)
self.source_config = cast(
EntityFilterConfigInterface, self.source_config
- ) # Satisfy typchecker
+ ) # Satisfy typechecker
def _filter_databases(self, databases: Iterable[Database]) -> Iterable[Database]:
"""Filter databases based on the filter pattern
Args:
- database (Database): Database to filter
+            databases (Iterable[Database]): Databases to filter
Returns:
bool
@@ -192,6 +193,21 @@ def _filter_tables(self, table: Table) -> bool:
return False
+ def _filter_views(self, table: Table) -> bool:
+ """Filter the tables based on include views configuration"""
+ # If we include views, nothing to filter
+ if self.source_config.includeViews:
+ return False
+
+ # Otherwise, filter out views
+ if table.tableType == TableType.View:
+ self.status.filter(
+ table.name.root, f"We are not including views {table.name.root}"
+ )
+ return True
+
+ return False
+
def _filter_column_metrics_computation(self):
"""Filter"""
@@ -242,6 +258,7 @@ def _filter_entities(self, tables: Iterable[Table]) -> Iterable[Table]:
not self.source_config.classificationFilterPattern
or not self.filter_classifications(table)
)
+ and not self._filter_views(table)
]
return tables
diff --git a/ingestion/src/metadata/sampler/sampler_interface.py b/ingestion/src/metadata/sampler/sampler_interface.py
index 970793cb2feb..fe363816d01c 100644
--- a/ingestion/src/metadata/sampler/sampler_interface.py
+++ b/ingestion/src/metadata/sampler/sampler_interface.py
@@ -76,9 +76,6 @@ def __init__(
self._columns: Optional[List[SQALikeColumn]] = None
self.sample_config = sample_config
- if not self.sample_config.profileSample:
- self.sample_config.profileSample = 100
-
self.entity = entity
self.include_columns = include_columns
self.exclude_columns = exclude_columns
diff --git a/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py
index 1712ccd6cef8..cd82565506b6 100644
--- a/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py
+++ b/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py
@@ -54,7 +54,6 @@ def __init__(
sample_query: Optional[str] = None,
storage_config: DataStorageConfig = None,
sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT,
- table_type: TableType = None,
**kwargs,
):
super().__init__(
@@ -68,7 +67,7 @@ def __init__(
sample_data_count=sample_data_count,
**kwargs,
)
- self.raw_dataset_type: TableType = table_type
+ self.raw_dataset_type: Optional[TableType] = entity.tableType
def set_tablesample(self, selectable: SqaTable):
"""Set the TABLESAMPLE clause for BigQuery
diff --git a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py
index cd87b79368bb..4d28f29f4a95 100644
--- a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py
+++ b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py
@@ -12,6 +12,7 @@
Helper module to handle data sampling
for the profiler
"""
+import hashlib
import traceback
from typing import List, Optional, Union, cast
@@ -32,6 +33,7 @@
from metadata.profiler.orm.functions.random_num import RandomNumFn
from metadata.profiler.processor.handle_partition import build_partition_predicate
from metadata.sampler.sampler_interface import SamplerInterface
+from metadata.utils.constants import UTF_8
from metadata.utils.helpers import is_safe_sql_query
from metadata.utils.logger import profiler_interface_registry_logger
@@ -109,17 +111,28 @@ def _base_sample_query(self, column: Optional[Column], label=None):
query = self.get_partitioned_query(query)
return query
+ def get_sampler_table_name(self) -> str:
+ """Get the base name of the SQA table for sampling.
+        We use MD5 as the hashing algorithm to generate a unique name for the table
+        while keeping its length under control. Otherwise, we would run into issues
+        with names getting truncated when we add suffixes such as _sample or _rnd
+        to the identifiers.
+ """
+ encoded_name = self.raw_dataset.__tablename__.encode(UTF_8)
+ hash_object = hashlib.md5(encoded_name)
+ return hash_object.hexdigest()
+
def get_sample_query(self, *, column=None) -> Query:
"""get query for sample data"""
if self.sample_config.profileSampleType == ProfileSampleType.PERCENTAGE:
rnd = self._base_sample_query(
column,
(ModuloFn(RandomNumFn(), 100)).label(RANDOM_LABEL),
- ).cte(f"{self.raw_dataset.__tablename__}_rnd")
+ ).cte(f"{self.get_sampler_table_name()}_rnd")
session_query = self.client.query(rnd)
return session_query.where(
rnd.c.random <= self.sample_config.profileSample
- ).cte(f"{self.raw_dataset.__tablename__}_sample")
+ ).cte(f"{self.get_sampler_table_name()}_sample")
table_query = self.client.query(self.raw_dataset)
session_query = self._base_sample_query(
@@ -129,7 +142,7 @@ def get_sample_query(self, *, column=None) -> Query:
return (
session_query.order_by(RANDOM_LABEL)
.limit(self.sample_config.profileSample)
- .cte(f"{self.raw_dataset.__tablename__}_rnd")
+ .cte(f"{self.get_sampler_table_name()}_rnd")
)
def get_dataset(self, column=None, **__) -> Union[DeclarativeMeta, AliasedClass]:
@@ -143,7 +156,7 @@ def get_dataset(self, column=None, **__) -> Union[DeclarativeMeta, AliasedClass]
if not self.sample_config.profileSample:
if self.partition_details:
partitioned = self._partitioned_table()
- return partitioned.cte(f"{self.raw_dataset.__tablename__}_partitioned")
+ return partitioned.cte(f"{self.get_sampler_table_name()}_partitioned")
return self.raw_dataset
@@ -162,23 +175,23 @@ def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData
return self._fetch_sample_data_from_user_query()
# Add new RandomNumFn column
- rnd = self.get_sample_query()
+ ds = self.get_dataset()
if not columns:
- sqa_columns = [col for col in inspect(rnd).c if col.name != RANDOM_LABEL]
+ sqa_columns = [col for col in inspect(ds).c if col.name != RANDOM_LABEL]
else:
# we can't directly use columns as it is bound to self.raw_dataset and not the rnd table.
# If we use it, it will result in a cross join between self.raw_dataset and rnd table
names = [col.name for col in columns]
sqa_columns = [
col
- for col in inspect(rnd).c
+ for col in inspect(ds).c
if col.name != RANDOM_LABEL and col.name in names
]
try:
sqa_sample = (
self.client.query(*sqa_columns)
- .select_from(rnd)
+ .select_from(ds)
.limit(self.sample_limit)
.all()
)
@@ -224,7 +237,7 @@ def _rdn_sample_from_user_query(self) -> Query:
stmt = stmt.columns(*list(inspect(self.raw_dataset).c))
return self.client.query(stmt.subquery()).cte(
- f"{self.raw_dataset.__tablename__}_user_sampled"
+ f"{self.get_sampler_table_name()}_user_sampled"
)
def _partitioned_table(self) -> Query:
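Sketch of the hashed CTE base name introduced in the sampler: a fixed-length MD5 digest keeps the `_rnd`, `_sample` and `_partitioned` aliases well under database identifier limits. The digest for a table named `users` is the one the updated BigQuery sampling unit tests further down assert on.

```python
# Stable, fixed-length base name for sampling CTEs, derived from the table name.
import hashlib


def get_sampler_table_name(table_name: str) -> str:
    return hashlib.md5(table_name.encode("utf-8")).hexdigest()


print(f"{get_sampler_table_name('users')}_rnd")
# Expected per the BigQuery sampling tests: 9bc65c2abec141778ffaa729489f3e87_rnd
```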
diff --git a/ingestion/tests/cli_e2e/base/config_builders/builders.py b/ingestion/tests/cli_e2e/base/config_builders/builders.py
index a1a213eba21d..0b0eb4a83d2e 100644
--- a/ingestion/tests/cli_e2e/base/config_builders/builders.py
+++ b/ingestion/tests/cli_e2e/base/config_builders/builders.py
@@ -90,8 +90,8 @@ def build(self) -> dict:
"type": "DatabaseLineage",
"queryLogDuration": 1,
"resultLimit": 10000,
- "processQueryLineage": True,
- "processStoredProcedureLineage": True,
+ "processQueryLineage": False,
+ "processStoredProcedureLineage": False,
}
}
return self.config
diff --git a/ingestion/tests/cli_e2e/test_cli_snowflake.py b/ingestion/tests/cli_e2e/test_cli_snowflake.py
index 77db0556bdfe..5a41378cfbf5 100644
--- a/ingestion/tests/cli_e2e/test_cli_snowflake.py
+++ b/ingestion/tests/cli_e2e/test_cli_snowflake.py
@@ -181,7 +181,7 @@ def view_column_lineage_count(self) -> int:
return 2
def expected_lineage_node(self) -> str:
- return "e2e_snowflake.E2E_DB.E2E_TEST.view_persons"
+ return "e2e_snowflake.E2E_DB.E2E_TEST.VIEW_PERSONS"
@staticmethod
def fqn_created_table() -> str:
diff --git a/ingestion/tests/cli_e2e/test_cli_vertica.py b/ingestion/tests/cli_e2e/test_cli_vertica.py
index 1591c9d160f3..f0e66ac784b2 100644
--- a/ingestion/tests/cli_e2e/test_cli_vertica.py
+++ b/ingestion/tests/cli_e2e/test_cli_vertica.py
@@ -74,7 +74,7 @@ def view_column_lineage_count(self) -> int:
return 2
def expected_lineage_node(self) -> str:
- return "e2e_vertica.VMart.public.vendor_dimension_v"
+ return "e2e_vertica.VMart.public.vendor_dimension"
@staticmethod
def fqn_created_table() -> str:
diff --git a/ingestion/tests/integration/integration_base.py b/ingestion/tests/integration/integration_base.py
index bed6db07c0e1..ab65a16089ae 100644
--- a/ingestion/tests/integration/integration_base.py
+++ b/ingestion/tests/integration/integration_base.py
@@ -162,7 +162,7 @@
"serviceConnection": {{
"config": {service_config}
}},
- "sourceConfig": {{"config": {{"type":"Profiler"}}}}
+ "sourceConfig": {{"config": {{"type":"Profiler", "profileSample": 100}}}}
}},
"processor": {{"type": "orm-profiler", "config": {{}}}},
"sink": {{"type": "metadata-rest", "config": {{}}}},
diff --git a/ingestion/tests/integration/ometa/test_ometa_custom_properties_api.py b/ingestion/tests/integration/ometa/test_ometa_custom_properties_api.py
index 960a15adae92..cd8e7515fa6b 100644
--- a/ingestion/tests/integration/ometa/test_ometa_custom_properties_api.py
+++ b/ingestion/tests/integration/ometa/test_ometa_custom_properties_api.py
@@ -93,7 +93,7 @@
"description": "Rating of a table",
"propertyType": {"name": "enum"},
"customPropertyConfig": {
- "config": {"values": ["Good", "Average", "Bad"], "multiSelect": False},
+ "config": {"values": ["Average", "Bad", "Good"], "multiSelect": False},
},
},
{
diff --git a/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py b/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py
index 2d1d976ec72e..d6ca5e8a7e0e 100644
--- a/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py
+++ b/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py
@@ -549,8 +549,7 @@ def test_workflow_values_partition(ingest, metadata, service_name):
profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile
assert profile.rowCount == 4.0
- # If we don't have any sample, default to 100
- assert profile.profileSample == 100.0
+ assert profile.profileSample == None
workflow_config["processor"] = {
"type": "orm-profiler",
diff --git a/ingestion/tests/integration/trino/hive/Dockerfile b/ingestion/tests/integration/trino/hive/Dockerfile
index eb218fd1d48c..576c37d40a35 100644
--- a/ingestion/tests/integration/trino/hive/Dockerfile
+++ b/ingestion/tests/integration/trino/hive/Dockerfile
@@ -2,8 +2,8 @@ ARG BASE_IMAGE=bitsondatadev/hive-metastore:latest
FROM ${BASE_IMAGE}
COPY conf/metastore-site.xml /opt/apache-hive-metastore-3.0.0-bin/conf/metastore-site.xml
COPY entrypoint.sh /entrypoint.sh
-ENV JDBC_CONNECTION_URL ""
-ENV MINIO_ENDPOINT ""
+ENV JDBC_CONNECTION_URL=""
+ENV MINIO_ENDPOINT=""
USER root
RUN chmod +x /entrypoint.sh
USER hive
\ No newline at end of file
diff --git a/ingestion/tests/integration/trino/test_profiler.py b/ingestion/tests/integration/trino/test_profiler.py
index 6e12ca7b7b0c..e8092c1ded7d 100644
--- a/ingestion/tests/integration/trino/test_profiler.py
+++ b/ingestion/tests/integration/trino/test_profiler.py
@@ -65,7 +65,7 @@ class ProfilerTestParameters:
ColumnProfile(
name="three",
timestamp=Timestamp(0),
- valuesCount=1,
+ valuesCount=2,
nullCount=1,
)
],
@@ -101,7 +101,7 @@ class ProfilerTestParameters:
ColumnProfile(
name="gender",
timestamp=Timestamp(0),
- valuesCount=932,
+ valuesCount=1000,
nullCount=0,
)
],
diff --git a/ingestion/tests/integration/trino/trino/Dockerfile b/ingestion/tests/integration/trino/trino/Dockerfile
index 87dd3e47bdc8..98c0ab84592b 100644
--- a/ingestion/tests/integration/trino/trino/Dockerfile
+++ b/ingestion/tests/integration/trino/trino/Dockerfile
@@ -1,5 +1,5 @@
ARG BASE_IMAGE
FROM ${BASE_IMAGE}
-ENV MINIO_ENDPOINT ""
-ENV HIVE_METASTORE_URI ""
+ENV MINIO_ENDPOINT=""
+ENV HIVE_METASTORE_URI=""
COPY etc /etc/trino
\ No newline at end of file
diff --git a/ingestion/tests/unit/profiler/sqlalchemy/bigquery/test_bigquery_sampling.py b/ingestion/tests/unit/profiler/sqlalchemy/bigquery/test_bigquery_sampling.py
index 4f8de1611375..5c719d7fc753 100644
--- a/ingestion/tests/unit/profiler/sqlalchemy/bigquery/test_bigquery_sampling.py
+++ b/ingestion/tests/unit/profiler/sqlalchemy/bigquery/test_bigquery_sampling.py
@@ -127,20 +127,31 @@ def test_sampling_for_views(self, sampler_mock):
"""
Test view sampling
"""
+ view_entity = Table(
+ id=uuid4(),
+ name="user",
+ columns=[
+ EntityColumn(
+ name=ColumnName("id"),
+ dataType=DataType.INT,
+ ),
+ ],
+ tableType=TableType.View,
+ )
+
sampler = BigQuerySampler(
service_connection_config=self.bq_conn,
ometa_client=None,
- entity=self.table_entity,
+ entity=view_entity,
sample_config=SampleConfig(
profileSampleType=ProfileSampleType.PERCENTAGE, profileSample=50.0
),
- table_type=TableType.View,
)
query: CTE = sampler.get_sample_query()
expected_query = (
- "WITH users_rnd AS \n(SELECT users.id AS id, ABS(RANDOM()) * 100 %% 100 AS random \n"
- "FROM users)\n SELECT users_rnd.id, users_rnd.random \n"
- "FROM users_rnd \nWHERE users_rnd.random <= 50.0"
+ 'WITH "9bc65c2abec141778ffaa729489f3e87_rnd" AS \n(SELECT users.id AS id, ABS(RANDOM()) * 100 %% 100 AS random \n'
+ 'FROM users)\n SELECT "9bc65c2abec141778ffaa729489f3e87_rnd".id, "9bc65c2abec141778ffaa729489f3e87_rnd".random \n'
+ 'FROM "9bc65c2abec141778ffaa729489f3e87_rnd" \nWHERE "9bc65c2abec141778ffaa729489f3e87_rnd".random <= 50.0'
)
assert (
expected_query.casefold()
@@ -151,10 +162,22 @@ def test_sampling_view_with_partition(self, sampler_mock):
"""
Test view sampling with partition
"""
+ view_entity = Table(
+ id=uuid4(),
+ name="user",
+ columns=[
+ EntityColumn(
+ name=ColumnName("id"),
+ dataType=DataType.INT,
+ ),
+ ],
+ tableType=TableType.View,
+ )
+
sampler = BigQuerySampler(
service_connection_config=self.bq_conn,
ometa_client=None,
- entity=self.table_entity,
+ entity=view_entity,
sample_config=SampleConfig(
profileSampleType=ProfileSampleType.PERCENTAGE, profileSample=50.0
),
@@ -168,9 +191,9 @@ def test_sampling_view_with_partition(self, sampler_mock):
)
query: CTE = sampler.get_sample_query()
expected_query = (
- "WITH users_rnd AS \n(SELECT users.id AS id, ABS(RANDOM()) * 100 %% 100 AS random \n"
- "FROM users \nWHERE id in ('1', '2'))\n SELECT users_rnd.id, users_rnd.random \n"
- "FROM users_rnd \nWHERE users_rnd.random <= 50.0"
+ 'WITH "9bc65c2abec141778ffaa729489f3e87_rnd" AS \n(SELECT users.id AS id, ABS(RANDOM()) * 100 %% 100 AS random \n'
+ "FROM users \nWHERE id in ('1', '2'))\n SELECT \"9bc65c2abec141778ffaa729489f3e87_rnd\".id, \"9bc65c2abec141778ffaa729489f3e87_rnd\".random \n"
+ 'FROM "9bc65c2abec141778ffaa729489f3e87_rnd" \nWHERE "9bc65c2abec141778ffaa729489f3e87_rnd".random <= 50.0'
)
assert (
expected_query.casefold()
diff --git a/ingestion/tests/unit/profiler/test_entity_fetcher.py b/ingestion/tests/unit/profiler/test_entity_fetcher.py
new file mode 100644
index 000000000000..9e1d475cd661
--- /dev/null
+++ b/ingestion/tests/unit/profiler/test_entity_fetcher.py
@@ -0,0 +1,88 @@
+# Copyright 2021 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Validate entity fetcher filtering strategies
+"""
+import uuid
+
+from metadata.generated.schema.entity.data.table import Table, TableType
+from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
+ OpenMetadataConnection,
+)
+from metadata.generated.schema.metadataIngestion.databaseServiceAutoClassificationPipeline import (
+ DatabaseServiceAutoClassificationPipeline,
+)
+from metadata.generated.schema.metadataIngestion.workflow import (
+ OpenMetadataWorkflowConfig,
+ Source,
+ SourceConfig,
+ WorkflowConfig,
+)
+from metadata.ingestion.api.status import Status
+from metadata.profiler.source.fetcher.fetcher_strategy import DatabaseFetcherStrategy
+
+VIEW = Table(
+ id=uuid.uuid4(),
+ name="view",
+ columns=[],
+ tableType=TableType.View,
+)
+
+TABLE = Table(
+ id=uuid.uuid4(),
+ name="table",
+ columns=[],
+ tableType=TableType.Regular,
+)
+
+
+def get_db_fetcher(source_config):
+ """Fetch database"""
+ workflow_config = OpenMetadataWorkflowConfig(
+ source=Source(
+ type="mysql",
+ serviceName="mysql",
+ sourceConfig=SourceConfig(
+ config=source_config,
+ ),
+ ),
+ workflowConfig=WorkflowConfig(
+ openMetadataServerConfig=OpenMetadataConnection(
+ hostPort="localhost:8585/api",
+ )
+ ),
+ )
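+    # The metadata client and global profiler config are not exercised by these
+    # filter tests, so Ellipsis is passed as a stand-in for both.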
+ return DatabaseFetcherStrategy(
+ config=workflow_config,
+ metadata=...,
+ global_profiler_config=...,
+ status=Status(),
+ )
+
+
+def test_include_views():
+ """Validate we can include/exclude views"""
+ config = DatabaseServiceAutoClassificationPipeline(
+ includeViews=False,
+ )
+ fetcher = get_db_fetcher(config)
+
+ assert fetcher._filter_views(VIEW)
+ assert not fetcher._filter_views(TABLE)
+
+ config = DatabaseServiceAutoClassificationPipeline(
+ includeViews=True,
+ )
+ fetcher = get_db_fetcher(config)
+
+ assert not fetcher._filter_views(VIEW)
+ assert not fetcher._filter_views(TABLE)
diff --git a/ingestion/tests/unit/test_databricks_lineage.py b/ingestion/tests/unit/test_databricks_lineage.py
index be2d4feb4af4..8647f63f3603 100644
--- a/ingestion/tests/unit/test_databricks_lineage.py
+++ b/ingestion/tests/unit/test_databricks_lineage.py
@@ -135,19 +135,3 @@ def __init__(self, methodName) -> None:
mock_databricks_config["source"],
config.workflowConfig.openMetadataServerConfig,
)
-
- @patch(
- "metadata.ingestion.source.database.databricks.client.DatabricksClient.list_query_history"
- )
- def test_get_table_query(self, list_query_history):
- list_query_history.return_value = mock_data
- results = self.databricks.get_table_query()
- query_list = []
- for result in results:
- if isinstance(result, TableQuery):
- query_list.append(result)
- for _, (expected, original) in enumerate(
- zip(EXPECTED_DATABRICKS_DETAILS, query_list)
- ):
- expected.analysisDate = original.analysisDate = datetime.now()
- self.assertEqual(expected, original)
diff --git a/ingestion/tests/unit/topology/dashboard/test_looker.py b/ingestion/tests/unit/topology/dashboard/test_looker.py
index 9acdc55111f8..5f0ca6b485b4 100644
--- a/ingestion/tests/unit/topology/dashboard/test_looker.py
+++ b/ingestion/tests/unit/topology/dashboard/test_looker.py
@@ -125,6 +125,13 @@
serviceType=DashboardServiceType.Looker,
)
+EXPECTED_PARSED_VIEWS = {
+ "v1": "table1",
+ "v2": "select * from v2",
+ "v3": "select * from (select * from v2)",
+ "v4": "select * from (select * from (select * from v2)) inner join (table1)",
+}
+
class LookerUnitTest(TestCase):
"""
@@ -560,3 +567,33 @@ def test_yield_dashboard_usage(self):
self.assertIsNotNone(
list(self.looker.yield_dashboard_usage(MOCK_LOOKER_DASHBOARD))[0].left
)
+
+ def test_derived_view_references(self):
+ """
+ Validate if we can find derived references in a SQL query
+ and replace them with their actual values
+ """
+ # pylint: disable=protected-access
+ self.looker._parsed_views.update(
+ {
+ "v1": "table1",
+ "v2": "select * from v2",
+ }
+ )
+ self.looker._unparsed_views.update(
+ {
+ "v3": "select * from ${v2.SQL_TABLE_NAME}",
+ "v4": "select * from ${v3.SQL_TABLE_NAME} inner join ${v1.SQL_TABLE_NAME}",
+ }
+ )
+ self.looker._derived_dependencies.add_edges_from(
+ [
+ ("v3", "v2"),
+ ("v4", "v3"),
+ ("v4", "v1"),
+ ]
+ )
+ list(self.looker.build_lineage_for_unparsed_views())
+
+ self.assertEqual(self.looker._parsed_views, EXPECTED_PARSED_VIEWS)
+ self.assertEqual(self.looker._unparsed_views, {})
diff --git a/ingestion/tests/unit/topology/dashboard/test_superset.py b/ingestion/tests/unit/topology/dashboard/test_superset.py
index b229095f4675..b8ea2c77c82a 100644
--- a/ingestion/tests/unit/topology/dashboard/test_superset.py
+++ b/ingestion/tests/unit/topology/dashboard/test_superset.py
@@ -18,6 +18,7 @@
from unittest import TestCase
import sqlalchemy
+from collate_sqllineage.core.models import Column, Schema, SubQuery, Table
from testcontainers.core.generic import DockerContainer
from testcontainers.postgres import PostgresContainer
@@ -56,6 +57,7 @@
from metadata.generated.schema.type.entityReference import EntityReference
from metadata.generated.schema.type.entityReferenceList import EntityReferenceList
from metadata.ingestion.api.steps import InvalidSourceException
+from metadata.ingestion.lineage.parser import LineageParser
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.source.dashboard.superset.api_source import SupersetAPISource
from metadata.ingestion.source.dashboard.superset.db_source import SupersetDBSource
@@ -170,7 +172,6 @@
owners=EXPECTED_USER,
)
-
EXPECTED_API_DASHBOARD = CreateDashboardRequest(
name=EntityName("10"),
displayName="Unicode Test",
@@ -279,13 +280,33 @@ def setup_sample_data(postgres_container):
id INTEGER PRIMARY KEY,
table_name VARCHAR(255),
schema VARCHAR(255),
- database_id INTEGER
+ database_id INTEGER,
+ sql VARCHAR(4000)
);
"""
INSERT_TABLES_DATA = """
INSERT INTO tables(id, table_name, schema, database_id)
VALUES (99, 'sample_table', 'main', 5);
"""
+ CREATE_TABLE_COLUMNS_TABLE = """
+ CREATE TABLE table_columns (
+ id INTEGER PRIMARY KEY,
+ table_name VARCHAR(255),
+ table_id INTEGER,
+ column_name VARCHAR(255),
+ type VARCHAR(255),
+ description VARCHAR(255)
+ );
+ """
+    INSERT_TABLE_COLUMNS_DATA = """
+ INSERT INTO
+ table_columns(id, table_name, table_id, column_name, type, description)
+ VALUES
+ (1099, 'sample_table', 99, 'id', 'VARCHAR', 'dummy description'),
+ (1199, 'sample_table', 99, 'timestamp', 'VARCHAR', 'dummy description'),
+ (1299, 'sample_table', 99, 'price', 'VARCHAR', 'dummy description');
+ """
+
connection.execute(sqlalchemy.text(CREATE_TABLE_AB_USER))
connection.execute(sqlalchemy.text(INSERT_AB_USER_DATA))
connection.execute(sqlalchemy.text(CREATE_TABLE_DASHBOARDS))
@@ -296,6 +317,8 @@ def setup_sample_data(postgres_container):
connection.execute(sqlalchemy.text(INSERT_DBS_DATA))
connection.execute(sqlalchemy.text(CREATE_TABLES_TABLE))
connection.execute(sqlalchemy.text(INSERT_TABLES_DATA))
+ connection.execute(sqlalchemy.text(CREATE_TABLE_COLUMNS_TABLE))
+        connection.execute(sqlalchemy.text(INSERT_TABLE_COLUMNS_DATA))
INITIAL_SETUP = True
@@ -616,3 +639,135 @@ def test_broken_column_type_in_datamodel(self):
self.superset_db.prepare()
parsed_datasource = self.superset_db.get_column_info(MOCK_DATASOURCE)
assert parsed_datasource[0].dataType.value == "INT"
+
+ def test_is_table_to_table_lineage(self):
+ table = Table(name="table_name", schema=Schema(name="schema_name"))
+
+ for test_case in [
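+            # ((column_from, column_from_parent, column_to, column_to_parent), expected_result)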
+ (
+ (
+ Column(name="col_name"),
+ Table(name="table_name", schema=Schema(name="schema_name")),
+ Column(name="col_name"),
+ Table(name="dataset_name", schema=Schema(name="schema_name")),
+ ),
+ True,
+ ),
+ (
+ (
+ Column(name="col_name"),
+ Table(name="table_name", schema=Schema(name=Schema.unknown)),
+ Column(name="col_name"),
+ Table(name="dataset_name", schema=Schema(name="schema_name")),
+ ),
+ False,
+ ),
+ (
+ (
+ Column(name="col_name"),
+ Table(name="other_table_name", schema=Schema(name="schema_name")),
+ Column(name="col_name"),
+ Table(name="dataset_name", schema=Schema(name="schema_name")),
+ ),
+ False,
+ ),
+ (
+ (
+ Column(name="col_name"),
+ Table(name="table_name", schema=Schema(name="schema_name")),
+ Column(name="col_name"),
+ SubQuery(
+ subquery="select * from 1",
+ subquery_raw="select * from 1",
+ alias="dummy_subquery",
+ ),
+ ),
+ False,
+ ),
+ ]:
+ _columns, expected = test_case
+
+ column_from, column_from_parent, column_to, column_to_parent = _columns
+
+ column_from._parent.add(column_from_parent)
+ column_to._parent.add(column_to_parent)
+
+ columns = (column_from, column_to)
+ self.assertEqual(
+ self.superset_db._is_table_to_table_lineage(columns, table), expected
+ )
+
+ def test_append_value_to_dict_list(self):
+ init_dict = {1: [2]}
+
+ self.superset_db._append_value_to_dict_list(init_dict, 1, 3)
+ self.assertListEqual(init_dict[1], [2, 3])
+
+ self.superset_db._append_value_to_dict_list(init_dict, 2, 1)
+ self.assertListEqual(init_dict[2], [1])
+
+ def test_get_table_schema(self):
+ for test_case in [
+ (
+ Table(name="test_table", schema=Schema(name=Schema.unknown)),
+ FetchChart(schema="chart_table_schema"),
+ "chart_table_schema",
+ ),
+ (
+ Table(name="test_table", schema=Schema(name="test_schema")),
+ FetchChart(schema="chart_table_schema"),
+ "test_schema",
+ ),
+ ]:
+ table, chart, expected = test_case
+
+ self.assertEqual(self.superset_db._get_table_schema(table, chart), expected)
+
+ def test_create_column_lineage_mapping_no_wildcard(self):
+ sql = """
+ INSERT INTO dummy_table SELECT id, timestamp FROM input_table;
+ """
+
+ parser = LineageParser(sql)
+ table = Table(name="input_table", schema=Schema(name=Schema.unknown))
+ chart = FetchChart(table_name="sample_table", table_schema="main", table_id=99)
+
+ expected = {"id": ["id"], "timestamp": ["timestamp"]}
+
+ self.assertDictEqual(
+ self.superset_db._create_column_lineage_mapping(parser, table, chart),
+ expected,
+ )
+
+ def test_create_column_lineage_mapping_with_wildcard(self):
+ sql = """
+ INSERT INTO dummy_table SELECT * FROM input_table;
+ """
+
+ parser = LineageParser(sql)
+ table = Table(name="input_table", schema=Schema(name=Schema.unknown))
+ chart = FetchChart(table_name="sample_table", table_schema="main", table_id=99)
+
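+        # The wildcard should expand to the columns registered for table_id=99 in setup_sample_data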
+ expected = {"id": ["id"], "timestamp": ["timestamp"], "price": ["price"]}
+
+ self.assertDictEqual(
+ self.superset_db._create_column_lineage_mapping(parser, table, chart),
+ expected,
+ )
+
+ def test_get_input_tables_from_dataset_sql(self):
+ sql = """SELECT id, timestamp FROM sample_table"""
+ chart = FetchChart(
+ sql=sql, table_name="sample_table", table_schema="main", table_id=99
+ )
+
+ result = self.superset_db._get_input_tables(chart)[0]
+
+ self.assertSetEqual({"id", "timestamp"}, set(result[1]))
+
+ def test_get_input_tables_when_table_has_no_sql(self):
+ chart = FetchChart(table_name="sample_table", table_schema="main", table_id=99)
+
+ result = self.superset_db._get_input_tables(chart)[0]
+
+ self.assertSetEqual({"id", "timestamp", "price"}, set(result[1]))
diff --git a/ingestion/tests/utils/sqa.py b/ingestion/tests/utils/sqa.py
index 19770c3f2969..73e370b7173c 100644
--- a/ingestion/tests/utils/sqa.py
+++ b/ingestion/tests/utils/sqa.py
@@ -18,6 +18,15 @@ class User(Base):
age = Column(Integer)
+class UserWithLongName(Base):
+ __tablename__ = "u" * 63 # Keep a max length name of 63 chars (max for Postgres)
+ id = Column(Integer, primary_key=True)
+ name = Column(String(256))
+ fullname = Column(String(256))
+ nickname = Column(String(256))
+ age = Column(Integer)
+
+
class SQATestUtils:
def __init__(self, connection_url: str):
self.connection_url = connection_url
@@ -34,14 +43,16 @@ def load_data(self, data: Sequence[DeclarativeMeta]):
self.session.commit()
def load_user_data(self):
- data = [
- User(name="John", fullname="John Doe", nickname="johnny b goode", age=30), # type: ignore
- User(name="Jane", fullname="Jone Doe", nickname=None, age=31), # type: ignore
- ] * 20
- self.load_data(data)
+ for clz in (User, UserWithLongName):
+ data = [
+ clz(name="John", fullname="John Doe", nickname="johnny b goode", age=30), # type: ignore
+ clz(name="Jane", fullname="Jone Doe", nickname=None, age=31), # type: ignore
+ ] * 20
+ self.load_data(data)
def create_user_table(self):
User.__table__.create(bind=self.session.get_bind())
+ UserWithLongName.__table__.create(bind=self.session.get_bind())
def close(self):
self.session.close()
diff --git a/openmetadata-docs/content/partials/v1.6/connectors/dashboard/connectors-list.md b/openmetadata-docs/content/partials/v1.6/connectors/dashboard/connectors-list.md
index 56bcab394e64..d91486345463 100644
--- a/openmetadata-docs/content/partials/v1.6/connectors/dashboard/connectors-list.md
+++ b/openmetadata-docs/content/partials/v1.6/connectors/dashboard/connectors-list.md
@@ -7,12 +7,12 @@
{% connectorInfoCard name="MicroStrategy" stage="PROD" href="/connectors/dashboard/microstrategy" platform="OpenMetadata" / %}
{% connectorInfoCard name="Mode" stage="PROD" href="/connectors/dashboard/mode" platform="OpenMetadata" / %}
{% connectorInfoCard name="PowerBI" stage="PROD" href="/connectors/dashboard/powerbi" platform="OpenMetadata" / %}
+{% connectorInfoCard name="PowerBI Report Server" stage="PROD" href="/connectors/dashboard/powerbireportserver" platform="Collate" / %}
{% connectorInfoCard name="Qlik Sense" stage="PROD" href="/connectors/dashboard/qliksense" platform="OpenMetadata" / %}
{% connectorInfoCard name="QuickSight" stage="PROD" href="/connectors/dashboard/quicksight" platform="OpenMetadata" / %}
{% connectorInfoCard name="Redash" stage="PROD" href="/connectors/dashboard/redash" platform="OpenMetadata" / %}
{% connectorInfoCard name="Superset" stage="PROD" href="/connectors/dashboard/superset" platform="OpenMetadata" / %}
{% connectorInfoCard name="Sigma" stage="PROD" href="/connectors/dashboard/sigma" platform="OpenMetadata" / %}
-{% connectorInfoCard name="PowerBI" stage="PROD" href="/connectors/dashboard/powerbi" platform="OpenMetadata" / %}
{% connectorInfoCard name="Tableau" stage="PROD" href="/connectors/dashboard/tableau" platform="OpenMetadata" / %}
{% /connectorsListContainer %}
\ No newline at end of file
diff --git a/openmetadata-docs/content/partials/v1.6/connectors/database/connectors-list.md b/openmetadata-docs/content/partials/v1.6/connectors/database/connectors-list.md
index 2a8002e8ba7c..4791916897cc 100644
--- a/openmetadata-docs/content/partials/v1.6/connectors/database/connectors-list.md
+++ b/openmetadata-docs/content/partials/v1.6/connectors/database/connectors-list.md
@@ -38,6 +38,7 @@
{% connectorInfoCard name="SingleStore" stage="PROD" href="/connectors/database/singlestore" platform="OpenMetadata" / %}
{% connectorInfoCard name="Snowflake" stage="PROD" href="/connectors/database/snowflake" platform="OpenMetadata" / %}
{% connectorInfoCard name="SQLite" stage="PROD" href="/connectors/database/sqlite" platform="OpenMetadata" / %}
+{% connectorInfoCard name="Synapse" stage="PROD" href="/connectors/database/synapse" platform="Collate" / %}
{% connectorInfoCard name="S3 Datalake" stage="PROD" href="/connectors/database/s3-datalake" platform="OpenMetadata" / %}
{% connectorInfoCard name="Teradata" stage="PROD" href="/connectors/database/teradata" platform="OpenMetadata" / %}
{% connectorInfoCard name="Trino" stage="PROD" href="/connectors/database/trino" platform="OpenMetadata" / %}
diff --git a/openmetadata-docs/content/partials/v1.6/connectors/yaml/auto-classification.md b/openmetadata-docs/content/partials/v1.6/connectors/yaml/auto-classification.md
index 42d7263a154a..25727a1f3121 100644
--- a/openmetadata-docs/content/partials/v1.6/connectors/yaml/auto-classification.md
+++ b/openmetadata-docs/content/partials/v1.6/connectors/yaml/auto-classification.md
@@ -151,4 +151,8 @@ After saving the YAML config, we will run the command the same way we did for th
metadata classify -c
```
-Note now instead of running `ingest`, we are using the `classify` command to select the Auto Classification workflow.
+{% note %}
+
+Instead of running `ingest`, we are now using the `classify` command to select the Auto Classification workflow.
+
+{% /note %}
diff --git a/openmetadata-docs/content/partials/v1.6/connectors/yaml/lineage.md b/openmetadata-docs/content/partials/v1.6/connectors/yaml/lineage.md
index d695495b6f02..bb5e97cbafcf 100644
--- a/openmetadata-docs/content/partials/v1.6/connectors/yaml/lineage.md
+++ b/openmetadata-docs/content/partials/v1.6/connectors/yaml/lineage.md
@@ -3,7 +3,6 @@
After running a Metadata Ingestion workflow, we can run Lineage workflow.
The `serviceName` should be the same as the one used in the Metadata Ingestion workflow, so the ingestion bot can get the `serviceConnection` details from the server.
-
### 1. Define the YAML Config
This is a sample config for BigQuery Lineage:
@@ -67,35 +66,35 @@ You can find all the definitions and types for the `sourceConfig` [here](https:
{% /codeInfo %}
-{% codeInfo srNumber=49 %}
+{% codeInfo srNumber=51 %}
**overrideViewLineage**: Set the 'Override View Lineage' toggle to control whether to override the existing view lineage.
{% /codeInfo %}
-{% codeInfo srNumber=51 %}
+{% codeInfo srNumber=52 %}
**processViewLineage**: Set the 'Process View Lineage' toggle to control whether to process view lineage.
{% /codeInfo %}
-{% codeInfo srNumber=52 %}
+{% codeInfo srNumber=53 %}
**processQueryLineage**: Set the 'Process Query Lineage' toggle to control whether to process query lineage.
{% /codeInfo %}
-{% codeInfo srNumber=53 %}
+{% codeInfo srNumber=54 %}
**processStoredProcedureLineage**: Set the 'Process Stored Procedure Lineage' toggle to control whether to process stored procedure lineage.
{% /codeInfo %}
-{% codeInfo srNumber=54 %}
+{% codeInfo srNumber=55 %}
**threads**: Number of Threads to use in order to parallelize lineage ingestion.
@@ -107,6 +106,7 @@ You can find all the definitions and types for the `sourceConfig` [here](https:
#### Sink Configuration
To send the metadata to OpenMetadata, it needs to be specified as `type: metadata-rest`.
+
{% /codeInfo %}
@@ -178,13 +178,6 @@ source:
# - table3
# - table4
```
-
-```yaml {% srNumber=49 %}
-sink:
- type: metadata-rest
- config: {}
-```
-
```yaml {% srNumber=51 %}
overrideViewLineage: false
```
@@ -205,6 +198,12 @@ sink:
threads: 1
```
+```yaml {% srNumber=49 %}
+sink:
+ type: metadata-rest
+ config: {}
+```
+
{% partial file="/v1.6/connectors/yaml/workflow-config.md" /%}
{% /codeBlock %}
diff --git a/openmetadata-docs/content/partials/v1.6/deployment/upgrade/upgrade-prerequisites.md b/openmetadata-docs/content/partials/v1.6/deployment/upgrade/upgrade-prerequisites.md
index 78def9dcdd72..a35feb19c047 100644
--- a/openmetadata-docs/content/partials/v1.6/deployment/upgrade/upgrade-prerequisites.md
+++ b/openmetadata-docs/content/partials/v1.6/deployment/upgrade/upgrade-prerequisites.md
@@ -46,6 +46,23 @@ You can refer to the following guide to get more details about the backup and re
{% /inlineCallout %}
{% /inlineCalloutContainer %}
+## Understanding the "Running" State in OpenMetadata
+
+In OpenMetadata, the **"Running"** state indicates that the OpenMetadata server has received a response from Airflow confirming that a workflow is in progress. However, if Airflow unexpectedly stops or crashes before it can send a failure status update through the **Failure Callback**, OpenMetadata remains unaware of the workflow’s actual state. As a result, the workflow may appear to be stuck in **"Running"** even though it is no longer executing.
+
+This situation can also occur during an OpenMetadata upgrade. If an ingestion pipeline was running at the time of the upgrade and the process caused Airflow to shut down, OpenMetadata would not receive any further updates from Airflow. Consequently, the pipeline status remains **"Running"** indefinitely.
+
+{% image
+ src="/images/v1.6/deployment/upgrade/running-state-in-openmetadata.png"
+ alt="Running State in OpenMetadata"
+ caption="Running State in OpenMetadata" /%}
+
+### Expected Steps to Resolve
+To resolve this issue:
+- Ensure that Airflow is restarted properly after an unexpected shutdown.
+- Manually update the pipeline status if necessary.
+- Check Airflow logs, or query the Airflow REST API, to verify whether the DAG execution was interrupted (see the sketch after this list).
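+
+If your Airflow webserver exposes the stable REST API, you can check the actual state of the recent ingestion DAG runs directly. This is a minimal sketch, assuming basic authentication is enabled; `<AIRFLOW_HOST>`, `<DAG_ID>`, `<USER>`, and `<PASSWORD>` are placeholders for your own values.
+
+```python
+import requests
+
+# List the five most recent runs of the ingestion DAG and print their states
+response = requests.get(
+    "https://<AIRFLOW_HOST>/api/v1/dags/<DAG_ID>/dagRuns",
+    params={"order_by": "-start_date", "limit": 5},
+    auth=("<USER>", "<PASSWORD>"),
+    timeout=30,
+)
+response.raise_for_status()
+for dag_run in response.json()["dag_runs"]:
+    print(dag_run["dag_run_id"], dag_run["state"])
+```
+
+A run that shows `failed` (or no recent run at all) while OpenMetadata still reports **"Running"** indicates that the status update from Airflow never reached OpenMetadata.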
+
### Update `sort_buffer_size` (MySQL) or `work_mem` (Postgres)
Before running the migrations, it is important to update these parameters to ensure there are no runtime errors.
@@ -86,6 +103,18 @@ After the migration is finished, you can revert this changes.
# Backward Incompatible Changes
+## 1.6.4
+
+### Airflow 2.9.3
+
+We are upgrading the Ingestion Airflow version to 2.9.3.
+
+The upgrade from the existing 2.9.1 to 2.9.3 should happen transparently. The only thing to note is that there is
+an ongoing issue with Airflow migrations and the `pymysql` driver, which we used before. If you are setting the
+`DB_SCHEME` environment variable in the ingestion image on your end, make sure it is now set to `mysql+mysqldb`.
+
+We have updated the default values accordingly.
+
## 1.6.2
### Executable Logical Test Suites
diff --git a/openmetadata-docs/content/partials/v1.7/connectors/dashboard/connectors-list.md b/openmetadata-docs/content/partials/v1.7/connectors/dashboard/connectors-list.md
index 56bcab394e64..d91486345463 100644
--- a/openmetadata-docs/content/partials/v1.7/connectors/dashboard/connectors-list.md
+++ b/openmetadata-docs/content/partials/v1.7/connectors/dashboard/connectors-list.md
@@ -7,12 +7,12 @@
{% connectorInfoCard name="MicroStrategy" stage="PROD" href="/connectors/dashboard/microstrategy" platform="OpenMetadata" / %}
{% connectorInfoCard name="Mode" stage="PROD" href="/connectors/dashboard/mode" platform="OpenMetadata" / %}
{% connectorInfoCard name="PowerBI" stage="PROD" href="/connectors/dashboard/powerbi" platform="OpenMetadata" / %}
+{% connectorInfoCard name="PowerBI Report Server" stage="PROD" href="/connectors/dashboard/powerbireportserver" platform="Collate" / %}
{% connectorInfoCard name="Qlik Sense" stage="PROD" href="/connectors/dashboard/qliksense" platform="OpenMetadata" / %}
{% connectorInfoCard name="QuickSight" stage="PROD" href="/connectors/dashboard/quicksight" platform="OpenMetadata" / %}
{% connectorInfoCard name="Redash" stage="PROD" href="/connectors/dashboard/redash" platform="OpenMetadata" / %}
{% connectorInfoCard name="Superset" stage="PROD" href="/connectors/dashboard/superset" platform="OpenMetadata" / %}
{% connectorInfoCard name="Sigma" stage="PROD" href="/connectors/dashboard/sigma" platform="OpenMetadata" / %}
-{% connectorInfoCard name="PowerBI" stage="PROD" href="/connectors/dashboard/powerbi" platform="OpenMetadata" / %}
{% connectorInfoCard name="Tableau" stage="PROD" href="/connectors/dashboard/tableau" platform="OpenMetadata" / %}
{% /connectorsListContainer %}
\ No newline at end of file
diff --git a/openmetadata-docs/content/partials/v1.7/connectors/database/connectors-list.md b/openmetadata-docs/content/partials/v1.7/connectors/database/connectors-list.md
index 738373a226bd..af311463fc4f 100644
--- a/openmetadata-docs/content/partials/v1.7/connectors/database/connectors-list.md
+++ b/openmetadata-docs/content/partials/v1.7/connectors/database/connectors-list.md
@@ -40,6 +40,7 @@
{% connectorInfoCard name="SingleStore" stage="PROD" href="/connectors/database/singlestore" platform="OpenMetadata" / %}
{% connectorInfoCard name="Snowflake" stage="PROD" href="/connectors/database/snowflake" platform="OpenMetadata" / %}
{% connectorInfoCard name="SQLite" stage="PROD" href="/connectors/database/sqlite" platform="OpenMetadata" / %}
+{% connectorInfoCard name="Synapse" stage="PROD" href="/connectors/database/synapse" platform="Collate" / %}
{% connectorInfoCard name="S3 Datalake" stage="PROD" href="/connectors/database/s3-datalake" platform="OpenMetadata" / %}
{% connectorInfoCard name="Teradata" stage="PROD" href="/connectors/database/teradata" platform="OpenMetadata" / %}
{% connectorInfoCard name="Trino" stage="PROD" href="/connectors/database/trino" platform="OpenMetadata" / %}
diff --git a/openmetadata-docs/content/partials/v1.7/connectors/metadata-ingestion-ui.md b/openmetadata-docs/content/partials/v1.7/connectors/metadata-ingestion-ui.md
index 167b5adcbdc6..3ef0542280ca 100644
--- a/openmetadata-docs/content/partials/v1.7/connectors/metadata-ingestion-ui.md
+++ b/openmetadata-docs/content/partials/v1.7/connectors/metadata-ingestion-ui.md
@@ -3,6 +3,7 @@
{% step srNumber=1 %}
{% stepDescription title="1. Visit the Services Page" %}
+Click `Settings` in the side navigation bar and then `Services`.
The first step is to ingest the metadata from your sources. To do that, you need to create a Service connection first.
diff --git a/openmetadata-docs/content/partials/v1.7/connectors/yaml/auto-classification.md b/openmetadata-docs/content/partials/v1.7/connectors/yaml/auto-classification.md
index 42d7263a154a..25727a1f3121 100644
--- a/openmetadata-docs/content/partials/v1.7/connectors/yaml/auto-classification.md
+++ b/openmetadata-docs/content/partials/v1.7/connectors/yaml/auto-classification.md
@@ -151,4 +151,8 @@ After saving the YAML config, we will run the command the same way we did for th
metadata classify -c
```
-Note now instead of running `ingest`, we are using the `classify` command to select the Auto Classification workflow.
+{% note %}
+
+Instead of running `ingest`, we are now using the `classify` command to select the Auto Classification workflow.
+
+{% /note %}
diff --git a/openmetadata-docs/content/partials/v1.7/connectors/yaml/lineage.md b/openmetadata-docs/content/partials/v1.7/connectors/yaml/lineage.md
index 95ef73ee9f1c..a940b3c93d8f 100644
--- a/openmetadata-docs/content/partials/v1.7/connectors/yaml/lineage.md
+++ b/openmetadata-docs/content/partials/v1.7/connectors/yaml/lineage.md
@@ -3,7 +3,6 @@
After running a Metadata Ingestion workflow, we can run Lineage workflow.
The `serviceName` should be the same as the one used in the Metadata Ingestion workflow, so the ingestion bot can get the `serviceConnection` details from the server.
-
### 1. Define the YAML Config
This is a sample config for BigQuery Lineage:
@@ -67,35 +66,35 @@ You can find all the definitions and types for the `sourceConfig` [here](https:
{% /codeInfo %}
-{% codeInfo srNumber=49 %}
+{% codeInfo srNumber=51 %}
**overrideViewLineage**: Set the 'Override View Lineage' toggle to control whether to override the existing view lineage.
{% /codeInfo %}
-{% codeInfo srNumber=51 %}
+{% codeInfo srNumber=52 %}
**processViewLineage**: Set the 'Process View Lineage' toggle to control whether to process view lineage.
{% /codeInfo %}
-{% codeInfo srNumber=52 %}
+{% codeInfo srNumber=53 %}
**processQueryLineage**: Set the 'Process Query Lineage' toggle to control whether to process query lineage.
{% /codeInfo %}
-{% codeInfo srNumber=53 %}
+{% codeInfo srNumber=54 %}
**processStoredProcedureLineage**: Set the 'Process Stored Procedure Lineage' toggle to control whether to process stored procedure lineage.
{% /codeInfo %}
-{% codeInfo srNumber=54 %}
+{% codeInfo srNumber=55 %}
**threads**: Number of Threads to use in order to parallelize lineage ingestion.
@@ -180,12 +179,6 @@ source:
# - table4
```
-```yaml {% srNumber=49 %}
-sink:
- type: metadata-rest
- config: {}
-```
-
```yaml {% srNumber=51 %}
overrideViewLineage: false
```
@@ -206,6 +199,12 @@ sink:
threads: 1
```
+```yaml {% srNumber=49 %}
+sink:
+ type: metadata-rest
+ config: {}
+```
+
{% partial file="/v1.7/connectors/yaml/workflow-config.md" /%}
{% /codeBlock %}
diff --git a/openmetadata-docs/content/partials/v1.7/deployment/upgrade/upgrade-prerequisites.md b/openmetadata-docs/content/partials/v1.7/deployment/upgrade/upgrade-prerequisites.md
index 3004c74156d7..bcda1577075d 100644
--- a/openmetadata-docs/content/partials/v1.7/deployment/upgrade/upgrade-prerequisites.md
+++ b/openmetadata-docs/content/partials/v1.7/deployment/upgrade/upgrade-prerequisites.md
@@ -46,6 +46,23 @@ You can refer to the following guide to get more details about the backup and re
{% /inlineCallout %}
{% /inlineCalloutContainer %}
+## Understanding the "Running" State in OpenMetadata
+
+In OpenMetadata, the **"Running"** state indicates that the OpenMetadata server has received a response from Airflow confirming that a workflow is in progress. However, if Airflow unexpectedly stops or crashes before it can send a failure status update through the **Failure Callback**, OpenMetadata remains unaware of the workflow’s actual state. As a result, the workflow may appear to be stuck in **"Running"** even though it is no longer executing.
+
+This situation can also occur during an OpenMetadata upgrade. If an ingestion pipeline was running at the time of the upgrade and the process caused Airflow to shut down, OpenMetadata would not receive any further updates from Airflow. Consequently, the pipeline status remains **"Running"** indefinitely.
+
+{% image
+ src="/images/v1.7/deployment/upgrade/running-state-in-openmetadata.png"
+ alt="Running State in OpenMetadata"
+ caption="Running State in OpenMetadata" /%}
+
+### Expected Steps to Resolve
+To resolve this issue:
+- Ensure that Airflow is restarted properly after an unexpected shutdown.
+- Manually update the pipeline status if necessary.
+- Check Airflow logs to verify if the DAG execution was interrupted.
+
### Update `sort_buffer_size` (MySQL) or `work_mem` (Postgres)
Before running the migrations, it is important to update these parameters to ensure there are no runtime errors.
diff --git a/openmetadata-docs/content/v1.5.x/getting-started/day-1/index.md b/openmetadata-docs/content/v1.5.x/getting-started/day-1/index.md
index 46f809d3bf1a..167c7b44ae59 100644
--- a/openmetadata-docs/content/v1.5.x/getting-started/day-1/index.md
+++ b/openmetadata-docs/content/v1.5.x/getting-started/day-1/index.md
@@ -24,7 +24,7 @@ with links to more detailed documentation.
## Step 1: Set up a Data Connector
Once you’re able to login to your Collate instance, set up a data connector to start bringing metadata into Collate.
-There are [80+ turnkey connectors](/connectors) to various services: data warehouses, data lakes, databases, dashboards,
+There are [90+ turnkey connectors](/connectors) to various services: data warehouses, data lakes, databases, dashboards,
messaging services, pipelines, ML models, storage services, and other Metadata Services.
Connections to [custom data sources](/connectors/custom-connectors) can also be created via API.
diff --git a/openmetadata-docs/content/v1.5.x/getting-started/index.md b/openmetadata-docs/content/v1.5.x/getting-started/index.md
index 4de859fbebf7..b3a0c941ecb6 100644
--- a/openmetadata-docs/content/v1.5.x/getting-started/index.md
+++ b/openmetadata-docs/content/v1.5.x/getting-started/index.md
@@ -7,7 +7,7 @@ collate: true
# Getting Started
Welcome to Collate's unified platform for data discovery, observability, and governance! Our platform centralizes all
-the context around your data to help you build high-quality and AI assets. This guide gives you all the information you
+the context around your data to help you build high-quality data and AI assets. This guide gives you all the information you
need to set up your Collate environment in 30 minutes.
## How Does Collate Work?
@@ -15,14 +15,14 @@ need to set up your Collate environment in 30 minutes.
Collate is designed for both technical and non-technical data practitioners to work together across a broad set of use cases,
including data discovery, lineage, observability, quality, collaboration, governance, and insights.
-A library of 80+ turnkey connectors is available to easily ingest metadata into Collate, such as data warehouses, data lakes,
+A library of 90+ turnkey connectors is available to easily ingest metadata into Collate, such as data warehouses, data lakes,
streaming, dashboards, ML models, and more. APIs are also available to easily ingest metadata from custom data sources.
Metadata from these different sources is organized into a Unified Metadata Graph, which provides a single, comprehensive
source of truth across your entire data estate.
This centralized information is surfaced through a unified user interface for all your use cases so that different data
practitioners no longer need to switch between different data catalogs, quality, or governance tools. Additionally,
-Collate can be extended through the application ecosystem, such as with AI productivity applications like MetaPilot,
+Collate can be extended through the application ecosystem, such as with AI productivity applications like Collate AI,
or with customer-built workflows to integrate Collate with your existing systems. These capabilities are built around
native collaboration capabilities for shared workflows across different teams so that every data practitioner can work
together: data platform, data governance, data scientist/analyst, and business user.
@@ -31,7 +31,7 @@ together: data platform, data governance, data scientist/analyst, and business u
Before we get started, here’s a quick summary of some of Collate’s main features:
-- **Discovery**: integrated catalog, quality, and glossary; natural language search, filtering, and faceting, 80+ turnkey data connectors, and MetaPilot AI Chatbot.
+- **Discovery**: integrated catalog, quality, and glossary; natural language search, filtering, and faceting, 90+ turnkey data connectors, and Collate AI Chatbot.
- **Lineage**: table and column-level lineage, automated data estate mapping and APIs, lineage layers and search, governance and PII automation and manual customization.
- **Observability**: alerting and notifications, incident management, third-party notifications, pipeline monitoring, root cause analysis, anomaly detection, data profiler.
- **Quality**: table and column test cases, no-code and SQL data quality tests, test suites, test case reporting, quality dashboards, widgets and data quality lineage maps.
diff --git a/openmetadata-docs/content/v1.5.x/quick-start/getting-started/day-1/index.md b/openmetadata-docs/content/v1.5.x/quick-start/getting-started/day-1/index.md
index 5a4af7e59086..1dd42a667d1f 100644
--- a/openmetadata-docs/content/v1.5.x/quick-start/getting-started/day-1/index.md
+++ b/openmetadata-docs/content/v1.5.x/quick-start/getting-started/day-1/index.md
@@ -7,10 +7,10 @@ slug: /quick-start/getting-started/day-1
Get started with your OpenMetadata service in a few simple steps:
-1. Set up a Data Connector: Connect your data sources to begin collecting metadata.
-2. Ingest Metadata: Run the metadata ingestion process to gather and push data insights.
-3. Invite Users: Add team members to collaborate and manage metadata together.
-4. Explore the Features: Dive into OpenMetadata's extensive feature set to unlock the full potential of your data.
+1. **Set up a Data Connector**: Connect your data sources to begin collecting metadata.
+2. **Ingest Metadata**: Run the metadata ingestion process to gather and push data insights.
+3. **Invite Users**: Add team members to collaborate and manage metadata together.
+4. **Explore the Features**: Dive into OpenMetadata's extensive feature set to unlock the full potential of your data.
**Ready to begin? Let's get started!**
@@ -20,7 +20,7 @@ You should receive your initial OpenMetadata credentials from OpenMetadata suppo
## Step 1: Set up a Data Connector
-Once you have logged into your OpenMetadata instance, set up a data connector to start ingesting metadata. OpenMetadata provides [80+ turnkey connectors](/connectors) for a wide range of services, including:
+Once you have logged into your OpenMetadata instance, set up a data connector to start ingesting metadata. OpenMetadata provides [90+ turnkey connectors](/connectors) for a wide range of services, including:
- Databases
- Dashboards
diff --git a/openmetadata-docs/content/v1.5.x/quick-start/getting-started/index.md b/openmetadata-docs/content/v1.5.x/quick-start/getting-started/index.md
index 778794a85fd3..7130d1987f6a 100644
--- a/openmetadata-docs/content/v1.5.x/quick-start/getting-started/index.md
+++ b/openmetadata-docs/content/v1.5.x/quick-start/getting-started/index.md
@@ -11,9 +11,9 @@ Welcome to OpenMetadata's unified platform for data discovery, observability, an
OpenMetadata is designed to support both technical and non-technical data practitioners across various use cases, including data discovery, lineage, observability, quality, collaboration, governance, and insights.
-The platform includes a library of 80+ turnkey connectors to easily ingest metadata from sources such as data warehouses, data lakes, streaming platforms, dashboards, and ML models. For custom data sources, APIs are available to streamline metadata ingestion. Metadata from these sources is organized into a Unified Metadata Graph, providing a single, comprehensive source of truth for your entire data estate.
+The platform includes a library of 90+ turnkey connectors to easily ingest metadata from sources such as data warehouses, data lakes, streaming platforms, dashboards, and ML models. For custom data sources, APIs are available to streamline metadata ingestion. Metadata from these sources is organized into a Unified Metadata Graph, providing a single, comprehensive source of truth for your entire data estate.
-This centralized metadata is accessible through a unified user interface, eliminating the need for practitioners to switch between multiple catalogs, quality, or governance tools. OpenMetadata can also be extended with applications, such as AI-driven productivity tools like MetaPilot, or through custom-built workflows that integrate the platform with existing systems.
+This centralized metadata is accessible through a unified user interface, eliminating the need for practitioners to switch between multiple catalogs, quality, or governance tools. OpenMetadata can also be extended with applications, such as AI-driven productivity tools like Collate AI, or through custom-built workflows that integrate the platform with existing systems.
The platform’s native collaboration features support shared workflows, enabling different teams—data platform engineers, governance professionals, data scientists/analysts, and business users—to collaborate effectively in a single environment.
## Key Features of OpenMetadata
@@ -23,7 +23,7 @@ Before we get started, here’s a quick summary of some of OpenMetadata’s main
### Discovery
- Integrated catalog, data quality, and glossary
- Natural language search, filtering, and faceting
-- 80+ turnkey data connectors
+- 90+ turnkey data connectors
### Lineage
- Table and column-level lineage
diff --git a/openmetadata-docs/content/v1.5.x/releases/releases/index.md b/openmetadata-docs/content/v1.5.x/releases/releases/index.md
index 3320ea48721d..f1f8b529900e 100644
--- a/openmetadata-docs/content/v1.5.x/releases/releases/index.md
+++ b/openmetadata-docs/content/v1.5.x/releases/releases/index.md
@@ -8,7 +8,7 @@ slug: /releases/all-releases
{% note %}
The OpenMetadata community is on a monthly release cadence. At every 4-5 weeks we will be releasing a new
-version. To see what's coming in next releases, please check our [Roadmap](/releases/roadmap) section.
+version. To see what's coming in upcoming releases, please check our {% collateContent %}[Roadmap](https://www.getcollate.io/roadmap){% /collateContent %}{% ossContent %}[Roadmap](/roadmap){% /ossContent %} section.
{% /note %}
@@ -542,7 +542,7 @@ To continue pursuing this objective, the application was completely refactored t
## Ingestion Connectors
-80+ connectors to help teams to centralize metadata. We continue to push the boundaries of this mission, in
+90+ connectors to help teams centralize metadata. We continue to push the boundaries of this mission, in
- **Apache Flink** as a Pipeline Connector
- **SAP ERP**, after a long and successful collaboration with our community and SAP experts
diff --git a/openmetadata-docs/content/v1.6.x/collate-menu.md b/openmetadata-docs/content/v1.6.x/collate-menu.md
index ff9212d46b90..29bddb700981 100644
--- a/openmetadata-docs/content/v1.6.x/collate-menu.md
+++ b/openmetadata-docs/content/v1.6.x/collate-menu.md
@@ -216,6 +216,10 @@ site_menu:
url: /connectors/database/s3-datalake/yaml
- category: Connectors / Database / S3 Datalake / Troubleshooting
url: /connectors/database/s3-datalake/troubleshooting
+ - category: Connectors / Database / Teradata
+ url: /connectors/database/teradata
+ - category: Connectors / Database / Teradata / Run Externally
+ url: /connectors/database/teradata/yaml
- category: Connectors / Database / Trino
url: /connectors/database/trino
- category: Connectors / Database / Trino / Run Externally
@@ -626,6 +630,8 @@ site_menu:
url: /how-to-guides/data-discovery/details
- category: How-to Guides / Data Discovery / Add Complex Queries using Advanced Search
url: /how-to-guides/data-discovery/advanced
+ - category: How-to Guides / Data Discovery / Troubleshooting
+ url: /how-to-guides/data-discovery/troubleshooting
- category: How-to Guides / Data Discovery / Bulk Upload Data Assets
url: /how-to-guides/data-discovery/bulk-upload
- category: How-to Guides / Data Discovery / How to Bulk Import Data Asset
diff --git a/openmetadata-docs/content/v1.6.x/connectors/database/db2/index.md b/openmetadata-docs/content/v1.6.x/connectors/database/db2/index.md
index ed64e43dd0c0..affc9b5ca3ea 100644
--- a/openmetadata-docs/content/v1.6.x/connectors/database/db2/index.md
+++ b/openmetadata-docs/content/v1.6.x/connectors/database/db2/index.md
@@ -98,6 +98,21 @@ If you are using DB2 for IBM i:
- In Host and Port you should not add the Port Number.
{% /note %}
+{% note %}
+If you have a **db2jcc_license_cisuz.jar** file, it will not work with **ibm_db**. This file is a **Db2 Connect** license for the Java Driver.
+For **non-Java drivers**, such as the Python Client used in OpenMetadata ingestion, a **Db2 Connect** client-side license is required, typically named **db2con*.lic**.
+
+The **db2jcc_license_cisuz.jar** file is specifically for Java-based clients, whereas OpenMetadata ingestion runs a Python client, making the `.jar` file incompatible.
+
+To activate a **non-Java license** for Db2 Connect **Application Server Edition**, **Advanced Application Server Edition**, **Enterprise Edition**, or **Trial**, follow these steps:
+- Download the **license activation kit** from IBM Passport Advantage: [IBM PPA](https://www.ibm.com/software/passportadvantage/pao_customer.html).
+- Unzip the package and locate the **non-Java license file** (e.g., `db2consv_ee.lic`).
+- Apply the `.lic` file to activate the license.
+
+For further reference, check this IBM post: [Everything About Db2 Connect Licensing](https://community.ibm.com/community/user/datamanagement/blogs/shilu-mathai2/2023/05/05/everything-about-db2-connect-licensing).
+
+{% /note %}
+
{% partial file="/v1.6/connectors/database/advanced-configuration.md" /%}
{% /extraContent %}
diff --git a/openmetadata-docs/content/v1.6.x/connectors/database/mongodb/yaml.md b/openmetadata-docs/content/v1.6.x/connectors/database/mongodb/yaml.md
index 660fbc964fd6..05fa8a104147 100644
--- a/openmetadata-docs/content/v1.6.x/connectors/database/mongodb/yaml.md
+++ b/openmetadata-docs/content/v1.6.x/connectors/database/mongodb/yaml.md
@@ -351,7 +351,7 @@ Here we are also importing all the basic requirements to parse YAMLs, handle dat
import yaml
from datetime import timedelta
from airflow import DAG
-from metadata.profiler.api.workflow import ProfilerWorkflow
+from metadata.workflow.profiler import ProfilerWorkflow
try:
from airflow.operators.python import PythonOperator
diff --git a/openmetadata-docs/content/v1.6.x/connectors/database/mysql/yaml.md b/openmetadata-docs/content/v1.6.x/connectors/database/mysql/yaml.md
index d5225a37c150..eea7d6f4b08d 100644
--- a/openmetadata-docs/content/v1.6.x/connectors/database/mysql/yaml.md
+++ b/openmetadata-docs/content/v1.6.x/connectors/database/mysql/yaml.md
@@ -191,8 +191,8 @@ For a simple, local installation using our docker containers, this looks like:
```yaml {% srNumber=40 %}
source:
- type: mssql-lineage
- serviceName: local_mssql
+ type: mysql-lineage
+ serviceName: local_mysql
sourceConfig:
config:
type: DatabaseLineage
@@ -243,12 +243,6 @@ source:
# - table4
```
-```yaml {% srNumber=49 %}
-sink:
- type: metadata-rest
- config: {}
-```
-
```yaml {% srNumber=51 %}
overrideViewLineage: false
```
@@ -269,6 +263,12 @@ sink:
threads: 1
```
+```yaml {% srNumber=49 %}
+sink:
+ type: metadata-rest
+ config: {}
+```
+
{% partial file="/v1.6/connectors/yaml/workflow-config.md" /%}
{% /codeBlock %}
diff --git a/openmetadata-docs/content/v1.6.x/connectors/database/snowflake/index.md b/openmetadata-docs/content/v1.6.x/connectors/database/snowflake/index.md
index c6f7ab738035..408b6b0b12ef 100644
--- a/openmetadata-docs/content/v1.6.x/connectors/database/snowflake/index.md
+++ b/openmetadata-docs/content/v1.6.x/connectors/database/snowflake/index.md
@@ -114,6 +114,9 @@ You can find more information about the `account_usage` schema [here](https://do
- **Include Temporary and Transient Tables**:
Optional configuration for ingestion of `TRANSIENT` and `TEMPORARY` tables, By default, it will skip the `TRANSIENT` and `TEMPORARY` tables.
- **Client Session Keep Alive**: Optional Configuration to keep the session active in case the ingestion job runs for longer duration.
+- **Account Usage Schema Name**: Full name of the account usage schema, used in case your user does not have direct access to the `SNOWFLAKE.ACCOUNT_USAGE` schema. In such a case you can replicate the `QUERY_HISTORY`, `TAG_REFERENCES`, `PROCEDURES`, and `FUNCTIONS` tables to a custom schema, for example `CUSTOM_DB.CUSTOM_SCHEMA`, and provide that name in this field.
+
+When using this field, make sure the `QUERY_HISTORY`, `TAG_REFERENCES`, `PROCEDURES`, and `FUNCTIONS` tables are all available within your custom schema.
{% partial file="/v1.6/connectors/database/advanced-configuration.md" /%}
diff --git a/openmetadata-docs/content/v1.6.x/connectors/database/snowflake/yaml.md b/openmetadata-docs/content/v1.6.x/connectors/database/snowflake/yaml.md
index 8a824e07928b..bd10edc7097b 100644
--- a/openmetadata-docs/content/v1.6.x/connectors/database/snowflake/yaml.md
+++ b/openmetadata-docs/content/v1.6.x/connectors/database/snowflake/yaml.md
@@ -150,6 +150,14 @@ This is a sample config for Snowflake:
{% /codeInfo %}
+{% codeInfo srNumber=40 %}
+
+**accountUsageSchema**: Full name of the account usage schema, used in case your user does not have direct access to the `SNOWFLAKE.ACCOUNT_USAGE` schema. In such a case you can replicate the `QUERY_HISTORY`, `TAG_REFERENCES`, `PROCEDURES`, and `FUNCTIONS` tables to a custom schema, for example `CUSTOM_DB.CUSTOM_SCHEMA`, and provide that name in this field.
+
+When using this field, make sure the `QUERY_HISTORY`, `TAG_REFERENCES`, `PROCEDURES`, and `FUNCTIONS` tables are all available within your custom schema.
+
+{% /codeInfo %}
+
{% codeInfo srNumber=6 %}
**includeTransientTables**: Optional configuration for ingestion of TRANSIENT and TEMPORARY tables, By default, it will skip the TRANSIENT and TEMPORARY tables.
@@ -231,6 +239,9 @@ source:
```yaml {% srNumber=5 %}
# database:
```
+```yaml {% srNumber=40 %}
+ # accountUsageSchema: SNOWFLAKE.ACCOUNT_USAGE
+```
```yaml {% srNumber=6 %}
includeTransientTables: false
```
diff --git a/openmetadata-docs/content/v1.6.x/connectors/pipeline/matillion/index.md b/openmetadata-docs/content/v1.6.x/connectors/pipeline/matillion/index.md
index 971b25269adf..9906fe618fa6 100644
--- a/openmetadata-docs/content/v1.6.x/connectors/pipeline/matillion/index.md
+++ b/openmetadata-docs/content/v1.6.x/connectors/pipeline/matillion/index.md
@@ -30,6 +30,8 @@ Configure and schedule Matillion metadata and profiler workflows from the OpenMe
To extract metadata from Matillion, you need to create a user with the following permissions:
- `API` Permission ( While Creating the User, from Admin -> User )
+- To retrieve lineage data, the user must be granted [Component-level permissions](https://docs.matillion.com/metl/docs/2932106/#component).
+- To enable lineage tracking in Matillion, **Matillion Enterprise Mode** is required. For detailed setup instructions and further information, refer to the official documentation: [Matillion Lineage Documentation](https://docs.matillion.com/metl/docs/2881895/).
### Matillion Versions
diff --git a/openmetadata-docs/content/v1.6.x/deployment/oss-security.md b/openmetadata-docs/content/v1.6.x/deployment/oss-security.md
new file mode 100644
index 000000000000..d0bd56e5c4e5
--- /dev/null
+++ b/openmetadata-docs/content/v1.6.x/deployment/oss-security.md
@@ -0,0 +1,44 @@
+---
+title: OSS Security Best Practices
+slug: /deployment/oss-security
+collate: false
+---
+
+# OSS Security
+
+## Encryption of Connection Credentials
+
+OpenMetadata ensures that sensitive information, such as passwords and connection secrets, is securely stored.
+
+- **Encryption Algorithm**: OpenMetadata uses **Fernet encryption** to encrypt secrets and passwords before storing them in the database.
+- **Fernet Encryption Details**:
+ - Uses **AES-128 in CBC mode** with a strong key-based approach.
+  - **Not based on hashing or salting**, but rather an encryption/decryption method with a symmetric key (see the short sketch after this list).
+- **Secrets Manager Support**:
+ - Users can **avoid storing credentials** in OpenMetadata by configuring an external **Secrets Manager**.
+ - More details on setting up a Secrets Manager can be found here:
+ 🔗 [Secrets Manager Documentation](https://docs.open-metadata.org/latest/deployment/secrets-manager)
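+
+To make the Fernet bullet above concrete, here is a minimal, self-contained sketch using the `cryptography` package. It is illustrative only and not OpenMetadata's internal code; the key handling and the sample secret are assumptions for the example.
+
+```python
+from cryptography.fernet import Fernet
+
+# Generate a symmetric key; in a real deployment the key comes from server configuration
+key = Fernet.generate_key()
+fernet = Fernet(key)
+
+# Encrypt a sample connection password before storing it
+token = fernet.encrypt(b"my-database-password")
+
+# Decrypt it again when the connection needs to be used
+assert fernet.decrypt(token) == b"my-database-password"
+```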
+
+## Secure Connections to Data Sources
+
+OpenMetadata supports **encrypted connections** to various databases and services.
+
+- **SSL/TLS Support**:
+ - OpenMetadata allows users to configure **SSL/TLS encryption** for secure data transmission.
+ - Users can specify **SSL modes** and provide **CA certificates** for SSL validation.
+- **How to Enable SSL?**
+ - Each connector supports different SSL configurations.
+ - Follow the detailed guide for enabling SSL in OpenMetadata:
+ 🔗 [Enable SSL in OpenMetadata](https://docs.open-metadata.org/latest/deployment/security/enable-ssl)
+
+## Additional Security Measures
+
+- **Role-Based Access Control (RBAC)**: OpenMetadata allows administrators to define user roles and permissions.
+- **Authentication & Authorization**: OpenMetadata supports integration with OAuth, SAML, and LDAP for secure authentication.
+- **Data Access Control**: Users can restrict access to metadata based on policies and governance rules.
+
+{% note %}
+- **Passwords and secrets are securely encrypted** using **Fernet encryption**.
+- **Connections to data sources can be encrypted** using **SSL/TLS**.
+- **Secrets Managers** can be used to manage credentials externally.
+{% /note %}
diff --git a/openmetadata-docs/content/v1.6.x/getting-started/day-1/index.md b/openmetadata-docs/content/v1.6.x/getting-started/day-1/index.md
index d423c6889b15..441f6ebb025f 100644
--- a/openmetadata-docs/content/v1.6.x/getting-started/day-1/index.md
+++ b/openmetadata-docs/content/v1.6.x/getting-started/day-1/index.md
@@ -8,10 +8,10 @@ collate: true
Get started with your Collate service in just few simple steps:
-1. Set up a Data Connector: Connect your data sources to begin collecting metadata.
-2. Ingest Metadata: Run the metadata ingestion to gather and push data insights.
-3. Invite Users: Add team members to collaborate and manage metadata together.
-4. Explore the Features: Dive into Collate's rich feature set to unlock the full potential of your data.
+1. **Set up a Data Connector**: Connect your data sources to begin collecting metadata.
+2. **Ingest Metadata**: Run the metadata ingestion to gather and push data insights.
+3. **Invite Users**: Add team members to collaborate and manage metadata together.
+4. **Explore the Features**: Dive into Collate's rich feature set to unlock the full potential of your data.
**Ready to begin? Let's get started!**
@@ -24,7 +24,7 @@ with links to more detailed documentation.
## Step 1: Set up a Data Connector
Once you’re able to login to your Collate instance, set up a data connector to start bringing metadata into Collate.
-There are [80+ turnkey connectors](/connectors) to various services: data warehouses, data lakes, databases, dashboards,
+There are [90+ turnkey connectors](/connectors) to various services: data warehouses, data lakes, databases, dashboards,
messaging services, pipelines, ML models, storage services, and other Metadata Services.
Connections to [custom data sources](/connectors/custom-connectors) can also be created via API.
diff --git a/openmetadata-docs/content/v1.6.x/getting-started/index.md b/openmetadata-docs/content/v1.6.x/getting-started/index.md
index 4de859fbebf7..065744ff02ef 100644
--- a/openmetadata-docs/content/v1.6.x/getting-started/index.md
+++ b/openmetadata-docs/content/v1.6.x/getting-started/index.md
@@ -7,7 +7,7 @@ collate: true
# Getting Started
Welcome to Collate's unified platform for data discovery, observability, and governance! Our platform centralizes all
-the context around your data to help you build high-quality and AI assets. This guide gives you all the information you
+the context around your data to help you build high-quality data and AI assets. This guide gives you all the information you
need to set up your Collate environment in 30 minutes.
## How Does Collate Work?
@@ -15,7 +15,7 @@ need to set up your Collate environment in 30 minutes.
Collate is designed for both technical and non-technical data practitioners to work together across a broad set of use cases,
including data discovery, lineage, observability, quality, collaboration, governance, and insights.
-A library of 80+ turnkey connectors is available to easily ingest metadata into Collate, such as data warehouses, data lakes,
+A library of 90+ turnkey connectors is available to easily ingest metadata into Collate, such as data warehouses, data lakes,
streaming, dashboards, ML models, and more. APIs are also available to easily ingest metadata from custom data sources.
Metadata from these different sources is organized into a Unified Metadata Graph, which provides a single, comprehensive
source of truth across your entire data estate.
@@ -31,7 +31,7 @@ together: data platform, data governance, data scientist/analyst, and business u
Before we get started, here’s a quick summary of some of Collate’s main features:
-- **Discovery**: integrated catalog, quality, and glossary; natural language search, filtering, and faceting, 80+ turnkey data connectors, and MetaPilot AI Chatbot.
+- **Discovery**: integrated catalog, quality, and glossary; natural language search, filtering, and faceting, 90+ turnkey data connectors, and MetaPilot AI Chatbot.
- **Lineage**: table and column-level lineage, automated data estate mapping and APIs, lineage layers and search, governance and PII automation and manual customization.
- **Observability**: alerting and notifications, incident management, third-party notifications, pipeline monitoring, root cause analysis, anomaly detection, data profiler.
- **Quality**: table and column test cases, no-code and SQL data quality tests, test suites, test case reporting, quality dashboards, widgets and data quality lineage maps.
diff --git a/openmetadata-docs/content/v1.6.x/how-to-guides/admin-guide/roles-policies/authorization.md b/openmetadata-docs/content/v1.6.x/how-to-guides/admin-guide/roles-policies/authorization.md
index fd765e58c3dc..73a24e20e89e 100644
--- a/openmetadata-docs/content/v1.6.x/how-to-guides/admin-guide/roles-policies/authorization.md
+++ b/openmetadata-docs/content/v1.6.x/how-to-guides/admin-guide/roles-policies/authorization.md
@@ -34,6 +34,7 @@ Here are some examples of conditions.
| **matchAllTags(tagFqn, [tagFqn…])** | Returns true if the resource has all the tags from the tag list. |
| **matchAnyTag(tagFqn, [tagFqn…])** | Returns true if the resource has any of the tags from the tag list. |
| **matchTeam()** | Returns true if the user belongs to the team that owns the resource. |
+| **hasDomain()** | Returns true if the logged-in user has domain access to the entity being accessed. |
Conditions are used to assess DataAsset like Tables/Topics/Dashboards etc.. for specific attributes.
diff --git a/openmetadata-docs/content/v1.6.x/how-to-guides/data-discovery/troubleshooting.md b/openmetadata-docs/content/v1.6.x/how-to-guides/data-discovery/troubleshooting.md
new file mode 100644
index 000000000000..5122d4e01d32
--- /dev/null
+++ b/openmetadata-docs/content/v1.6.x/how-to-guides/data-discovery/troubleshooting.md
@@ -0,0 +1,42 @@
+---
+title: Troubleshooting Export Issue
+slug: /how-to-guides/data-discovery/troubleshooting
+---
+
+# Troubleshooting Export Issue
+When attempting to export a **CSV file for a Glossary**, the process gets stuck on the message **"Export initiated successfully."** and never completes. The file is not downloaded, and the export button remains disabled.
+
+This issue may occur if **WebSockets are blocked** in your network setup due to a **proxy** or **load balancer** configuration. OpenMetadata relies on WebSockets for real-time communication, and if they are blocked, the export process cannot complete.
+
+## Troubleshooting Steps
+
+### Step 1: Check for Load Balancer or Proxy
+
+If your setup includes a **load balancer** or **proxy**, verify whether WebSockets are being blocked.
+
+1. Run the following API request to check the export status:
+
+```bash
+# Replace the placeholders with your OpenMetadata host and the glossary name.
+curl -X GET "https://<your-openmetadata-host>/api/v1/glossaries/name/<glossary-name>/exportAsync"
+```
+
+If the response does not return a file and remains in an active state indefinitely, WebSockets might be blocked.
+
+### Step 2: Verify WebSocket Connectivity
+
+1. Open the Developer Tools in your browser (F12 or Ctrl + Shift + I in Chrome).
+2. Navigate to the Network tab.
+3. Filter requests by WebSockets (WS).
+4. Check if WebSocket requests to OpenMetadata (wss://) are blocked, failing, or not established.
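+
+If you prefer to verify this outside the browser, below is a minimal, illustrative probe written in Python using the third-party `websockets` package (`pip install websockets`). The `wss://` URL is a placeholder; replace it with the exact WebSocket URL you see failing in the Network tab.
+
+```python
+import asyncio
+
+import websockets  # third-party package: pip install websockets
+
+# Placeholder endpoint: substitute the wss:// URL observed in the browser.
+WSS_URL = "wss://<your-openmetadata-host>/socket.io/?EIO=4&transport=websocket"
+
+async def probe() -> None:
+    try:
+        connection = await asyncio.wait_for(websockets.connect(WSS_URL), timeout=10)
+        print("WebSocket handshake succeeded")
+        await connection.close()
+    except Exception as exc:  # blocked by a proxy, rejected, or timed out
+        print(f"WebSocket connection failed: {exc!r}")
+
+asyncio.run(probe())
+```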
+
+### Step 3: Adjust WebSocket Settings in Your Proxy
+
+If WebSockets are blocked, update your proxy configuration to allow WebSocket traffic.
+
+### Step 4: Restart Services and Verify
+
+1. Restart your proxy or load balancer after making the configuration changes.
+2. Clear browser cache and cookies.
+3. Retry the CSV export in OpenMetadata.
+
+Once WebSockets are enabled in the proxy settings, the glossary export should complete successfully, and the CSV file should be available for download.
diff --git a/openmetadata-docs/content/v1.6.x/how-to-guides/data-governance/automation/index.md b/openmetadata-docs/content/v1.6.x/how-to-guides/data-governance/automation/index.md
index 7409a360ff06..6f047c6242cc 100644
--- a/openmetadata-docs/content/v1.6.x/how-to-guides/data-governance/automation/index.md
+++ b/openmetadata-docs/content/v1.6.x/how-to-guides/data-governance/automation/index.md
@@ -23,7 +23,23 @@ Managing metadata manually can be challenging, particularly in dynamic environme
## Key Use Cases for Collate Automations
-### 1. Bulk Ownership and Domain Assignment
+### 1. Bulk Description
+
+{% image
+src="/images/v1.6/how-to-guides/governance/automator-description.png"
+alt="Getting started with Automation"
+caption="Getting started with Automation"
+/%}
+
+- **Problem**: Many datasets lack descriptions, making it difficult for users to understand the data's purpose and contents. Sometimes, the same column description needs to be added to multiple datasets.
+- **Solution**: Automations can bulk-apply descriptions to tables and columns, ensuring that all data assets are consistently documented.
+- **Benefit**: This use case improves data discoverability and understanding, making it easier for users to find and use the data effectively.
+
+For the Action Configuration:
+- **Apply to Children**: Lets you apply the description to the selected child assets (e.g., columns) within an asset.
+- **Overwrite Metadata**: Allows you to overwrite existing descriptions with the new description. Otherwise, we will only apply the description to tables or columns that do not already have one.
+
+### 2. Bulk Ownership and Domain Assignment
{% image
src="/images/v1.6/how-to-guides/governance/bulk-ownership-and.png"
@@ -35,7 +51,10 @@ caption="Getting started with Automation"
- **Solution**: Automations can bulk-assign ownership and domains to datasets, ensuring all data assets are correctly categorized and owned. This process can be applied to tables, schemas, or other assets within Collate.
- **Benefit**: This use case ensures data assets have a designated owner and are organized under the appropriate domain, making data more discoverable and accountable.
-### 2. Bulk Tagging and Glossary Term Assignment
+For the Action Configuration:
+- **Overwrite Metadata**: Allows you to overwrite existing owner or domain with the configured one. Otherwise, we will only apply the owner or domain to assets that do not have an existing owner or domain.
+
+### 3. Bulk Tagging and Glossary Term Assignment
{% image
src="/images/v1.6/how-to-guides/governance/bulk-tagging-glossary.png"
@@ -47,7 +66,12 @@ caption="Getting started with Automation"
- **Solution**: Automations allow users to bulk-apply tags (e.g., PII) or glossary terms (e.g., Customer ID) to specific datasets, ensuring uniformity across the platform.
- **Benefit**: This automation reduces the risk of missing important tags like PII-sensitive and ensures that key metadata elements are applied consistently across datasets.
-### 3. Metadata Propagation via Lineage
+For the Action Configuration:
+- **Apply to Children**: Lets you apply the Tags or Glossary Terms to the selected child assets (e.g., columns) within an asset.
+- **Overwrite Metadata**: Allows you to overwrite existing Tags or Terms with the configured ones. Otherwise, we will add the new Tags or Terms to the existing ones.
+
+
+### 4. Metadata Propagation via Lineage
{% image
src="/images/v1.6/how-to-guides/governance/metadata-propogation.png"
@@ -59,7 +83,19 @@ caption="Getting started with Automation"
- **Solution**: Use automations to propagate metadata across related datasets, ensuring that all relevant data inherits the correct metadata properties from the source dataset.
- **Benefit**: Metadata consistency is ensured across the entire data lineage, reducing the need for manual updates and maintaining a single source of truth.
-### 4. Automatic PII Detection and Tagging
+For the Action Configuration:
+1. First, we can choose if we want the propagation to happen at the Parent level (e.g., Table), Column Level, or both. This can be configured by selecting **Propagate Parent** and/or **Propagate Column Level**.
+2. Then, we can control which pieces of metadata we want to propagate via lineage:
+ - **Propagate Description**: Propagates the description from the source asset to the downstream assets. Works for both parent and column-level.
+ - **Propagate Tags**: Propagates the tags from the source asset to the downstream assets. Works for both parent and column-level.
+ - **Propagate Glossary Terms**: Propagates the glossary terms from the source asset to the downstream assets. Works for both parent and column-level.
+ - **Propagate Owners**: Only applicable for Parent assets. Propagates the owner information to downstream assets.
+  - **Propagate Tier**: Only applicable for Parent assets. Propagates the tier information to downstream assets.
+
+As with other actions, you can choose to **Overwrite Metadata** or keep the existing metadata and only apply the new metadata to assets that do not have the metadata already.
+
+
+### 5. Automatic PII Detection and Tagging
{% image
src="/images/v1.6/how-to-guides/governance/automatic-detection.png"
@@ -67,6 +103,15 @@ alt="Getting started with Automation"
caption="Getting started with Automation"
/%}
+{% note noteType="Warning" %}
+
+We recommend using the **Auto Classification** workflow instead, which allows you to discover PII data automatically,
+even in cases where you don't want to ingest the Sample Data into Collate.
+
+Note that this automation (ML Tagging) will be deprecated in future releases.
+
+{% /note %}
+
- **Problem**: Manually identifying and tagging Personally Identifiable Information (PII) across large datasets is labor-intensive and prone to errors.
- **Solution**: Automations can automatically detect PII data (e.g., emails, usernames) and apply relevant tags to ensure that sensitive data is flagged appropriately for compliance.
- **Benefit**: Ensures compliance with data protection regulations by consistently tagging sensitive data, reducing the risk of non-compliance.
diff --git a/openmetadata-docs/content/v1.6.x/how-to-guides/data-governance/classification/Auto Classification/auto-pii-tagging.md b/openmetadata-docs/content/v1.6.x/how-to-guides/data-governance/classification/Auto Classification/auto-pii-tagging.md
index 9bf30da68f38..c691a6062151 100644
--- a/openmetadata-docs/content/v1.6.x/how-to-guides/data-governance/classification/Auto Classification/auto-pii-tagging.md
+++ b/openmetadata-docs/content/v1.6.x/how-to-guides/data-governance/classification/Auto Classification/auto-pii-tagging.md
@@ -7,11 +7,6 @@ slug: /how-to-guides/data-governance/classification/auto/auto-pii-tagging
Auto PII tagging for Sensitive/NonSensitive at the column level is performed based on the two approaches described below.
-{% note %}
-PII Tagging is only available during `Profiler Ingestion`.
-{% /note %}
-
-
## Tagging logic
1. **Column Name Scanner**: We validate the column names of the table against a set of regex rules that help us identify
diff --git a/openmetadata-docs/content/v1.6.x/how-to-guides/data-governance/classification/Auto Classification/external-workflow.md b/openmetadata-docs/content/v1.6.x/how-to-guides/data-governance/classification/Auto Classification/external-workflow.md
index 2d82baac5abe..6c5c464f270b 100644
--- a/openmetadata-docs/content/v1.6.x/how-to-guides/data-governance/classification/Auto Classification/external-workflow.md
+++ b/openmetadata-docs/content/v1.6.x/how-to-guides/data-governance/classification/Auto Classification/external-workflow.md
@@ -42,6 +42,19 @@ The Auto Classification Workflow enables automatic tagging of sensitive informat
- When set to `true`, filtering patterns will be applied to the Fully Qualified Name of a table (e.g., `service_name.db_name.schema_name.table_name`).
- When set to `false`, filtering applies only to raw table names.
+## Auto Classification Workflow Execution
+
+To execute the **Auto Classification Workflow**, follow the steps below:
+
+### 1. Install the Required Python Package
+Ensure you have the correct OpenMetadata ingestion package installed, including the **PII Processor** module:
+
+```bash
+pip install "openmetadata-ingestion[pii-processor]"
+```
+
+### 2. Define and Execute the Python Workflow
+Instead of using a YAML configuration, use the `AutoClassificationWorkflow` from the OpenMetadata ingestion framework to trigger the process programmatically, as sketched below.
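+
+The snippet below is a minimal sketch rather than the verbatim API: it assumes the class is importable as `metadata.workflow.classification.AutoClassificationWorkflow` (the exact import path may vary across `openmetadata-ingestion` versions) and that it follows the same `create` / `execute` / `raise_from_status` / `print_status` / `stop` pattern used by the other ingestion workflows in these docs, such as `ProfilerWorkflow`. The file name `auto_classification.yaml` is a placeholder for the sample YAML shown in the next section.
+
+```python
+import yaml
+
+# Assumed import path; adjust it to your installed openmetadata-ingestion version.
+from metadata.workflow.classification import AutoClassificationWorkflow
+
+# Placeholder file holding the sample Auto Classification YAML from the section below.
+with open("auto_classification.yaml") as config_file:
+    workflow_config = yaml.safe_load(config_file)
+
+workflow = AutoClassificationWorkflow.create(workflow_config)
+workflow.execute()
+workflow.raise_from_status()  # fail fast if any step reported errors
+workflow.print_status()
+workflow.stop()
+```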
+
## Sample Auto Classification Workflow yaml
```yaml
@@ -103,6 +116,14 @@ workflowConfig:
jwtToken: "eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg"
```
+### 3. Expected Outcome
+- Automatically classifies and tags sensitive data based on predefined patterns and confidence levels.
+- Improves metadata enrichment and enhances data governance practices.
+- Provides visibility into sensitive data across databases.
+
+This approach ensures that the Auto Classification Workflow is executed correctly using the appropriate OpenMetadata ingestion framework.
+
+{% partial file="/v1.6/connectors/yaml/auto-classification.md" variables={connector: "snowflake"} /%}
+
## Workflow Execution
### To Execute the Auto Classification Workflow:
diff --git a/openmetadata-docs/content/v1.6.x/menu.md b/openmetadata-docs/content/v1.6.x/menu.md
index a13449f2717a..b020a1eae5dd 100644
--- a/openmetadata-docs/content/v1.6.x/menu.md
+++ b/openmetadata-docs/content/v1.6.x/menu.md
@@ -223,6 +223,9 @@ site_menu:
- category: Deployment / Metrics
url: /deployment/metrics
+ - category: Deployment / OSS Security
+ url: /deployment/oss-security
+
- category: Connectors
url: /connectors
@@ -799,6 +802,8 @@ site_menu:
url: /how-to-guides/data-discovery/details
- category: How-to Guides / Data Discovery / Add Complex Queries using Advanced Search
url: /how-to-guides/data-discovery/advanced
+ - category: How-to Guides / Data Discovery / Troubleshooting
+ url: /how-to-guides/data-discovery/troubleshooting
- category: How-to Guides / Data Discovery / OpenMetadata Chrome Extension
url: /how-to-guides/data-discovery/openmetadata-extension
- category: How-to Guides / Data Discovery / OpenMetadata Chrome Extension / Viewing Activity Feeds and Managing Tasks
diff --git a/openmetadata-docs/content/v1.6.x/quick-start/getting-started/day-1/index.md b/openmetadata-docs/content/v1.6.x/quick-start/getting-started/day-1/index.md
index 6bc58494e729..14e6212205cd 100644
--- a/openmetadata-docs/content/v1.6.x/quick-start/getting-started/day-1/index.md
+++ b/openmetadata-docs/content/v1.6.x/quick-start/getting-started/day-1/index.md
@@ -7,10 +7,10 @@ slug: /quick-start/getting-started/day-1
Get started with your OpenMetadata service in a few simple steps:
-1. Set up a Data Connector: Connect your data sources to begin collecting metadata.
-2. Ingest Metadata: Run the metadata ingestion process to gather and push data insights.
-3. Invite Users: Add team members to collaborate and manage metadata together.
-4. Explore the Features: Dive into OpenMetadata's extensive feature set to unlock the full potential of your data.
+1. **Set up a Data Connector**: Connect your data sources to begin collecting metadata.
+2. **Ingest Metadata**: Run the metadata ingestion process to gather and push data insights.
+3. **Invite Users**: Add team members to collaborate and manage metadata together.
+4. **Explore the Features**: Dive into OpenMetadata's extensive feature set to unlock the full potential of your data.
**Ready to begin? Let's get started!**
@@ -20,7 +20,7 @@ You should receive your initial OpenMetadata credentials from OpenMetadata suppo
## Step 1: Set up a Data Connector
-Once you have logged into your OpenMetadata instance, set up a data connector to start ingesting metadata. OpenMetadata provides [80+ turnkey connectors](/connectors) for a wide range of services, including:
+Once you have logged into your OpenMetadata instance, set up a data connector to start ingesting metadata. OpenMetadata provides [90+ turnkey connectors](/connectors) for a wide range of services, including:
- Databases
- Dashboards
diff --git a/openmetadata-docs/content/v1.6.x/quick-start/getting-started/index.md b/openmetadata-docs/content/v1.6.x/quick-start/getting-started/index.md
index 778794a85fd3..fb8b26990b6b 100644
--- a/openmetadata-docs/content/v1.6.x/quick-start/getting-started/index.md
+++ b/openmetadata-docs/content/v1.6.x/quick-start/getting-started/index.md
@@ -11,7 +11,7 @@ Welcome to OpenMetadata's unified platform for data discovery, observability, an
OpenMetadata is designed to support both technical and non-technical data practitioners across various use cases, including data discovery, lineage, observability, quality, collaboration, governance, and insights.
-The platform includes a library of 80+ turnkey connectors to easily ingest metadata from sources such as data warehouses, data lakes, streaming platforms, dashboards, and ML models. For custom data sources, APIs are available to streamline metadata ingestion. Metadata from these sources is organized into a Unified Metadata Graph, providing a single, comprehensive source of truth for your entire data estate.
+The platform includes a library of 90+ turnkey connectors to easily ingest metadata from sources such as data warehouses, data lakes, streaming platforms, dashboards, and ML models. For custom data sources, APIs are available to streamline metadata ingestion. Metadata from these sources is organized into a Unified Metadata Graph, providing a single, comprehensive source of truth for your entire data estate.
This centralized metadata is accessible through a unified user interface, eliminating the need for practitioners to switch between multiple catalogs, quality, or governance tools. OpenMetadata can also be extended with applications, such as AI-driven productivity tools like MetaPilot, or through custom-built workflows that integrate the platform with existing systems.
The platform’s native collaboration features support shared workflows, enabling different teams—data platform engineers, governance professionals, data scientists/analysts, and business users—to collaborate effectively in a single environment.
@@ -23,7 +23,7 @@ Before we get started, here’s a quick summary of some of OpenMetadata’s main
### Discovery
- Integrated catalog, data quality, and glossary
- Natural language search, filtering, and faceting
-- 80+ turnkey data connectors
+- 90+ turnkey data connectors
### Lineage
- Table and column-level lineage
diff --git a/openmetadata-docs/content/v1.6.x/quick-start/local-docker-deployment.md b/openmetadata-docs/content/v1.6.x/quick-start/local-docker-deployment.md
index 9dc26222aaef..0003c3023b95 100644
--- a/openmetadata-docs/content/v1.6.x/quick-start/local-docker-deployment.md
+++ b/openmetadata-docs/content/v1.6.x/quick-start/local-docker-deployment.md
@@ -119,15 +119,15 @@ The latest version is at the top of the page
You can use the curl or wget command as well to fetch the docker compose files from your terminal -
```commandline
-curl -sL -o docker-compose.yml https://github.com/open-metadata/OpenMetadata/releases/download/1.6.1-release/docker-compose.yml
+curl -sL -o docker-compose.yml https://github.com/open-metadata/OpenMetadata/releases/download/1.6.3-release/docker-compose.yml
-curl -sL -o docker-compose-postgres.yml https://github.com/open-metadata/OpenMetadata/releases/download/1.6.1-release/docker-compose-postgres.yml
+curl -sL -o docker-compose-postgres.yml https://github.com/open-metadata/OpenMetadata/releases/download/1.6.3-release/docker-compose-postgres.yml
```
```commandline
-wget https://github.com/open-metadata/OpenMetadata/releases/download/1.6.1-release/docker-compose.yml
+wget https://github.com/open-metadata/OpenMetadata/releases/download/1.6.3-release/docker-compose.yml
-wget https://github.com/open-metadata/OpenMetadata/releases/download/1.6.1-release/docker-compose-postgres.yml
+wget https://github.com/open-metadata/OpenMetadata/releases/download/1.6.3-release/docker-compose-postgres.yml
```
### 3. Start the Docker Compose Services
@@ -166,10 +166,10 @@ You can validate that all containers are up by running with command `docker ps`.
```commandline
❯ docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
-470cc8149826 openmetadata/server:1.6.1 "./openmetadata-star…" 45 seconds ago Up 43 seconds 3306/tcp, 9200/tcp, 9300/tcp, 0.0.0.0:8585-8586->8585-8586/tcp openmetadata_server
-63578aacbff5 openmetadata/ingestion:1.6.1 "./ingestion_depende…" 45 seconds ago Up 43 seconds 0.0.0.0:8080->8080/tcp openmetadata_ingestion
+470cc8149826 openmetadata/server:1.6.3 "./openmetadata-star…" 45 seconds ago Up 43 seconds 3306/tcp, 9200/tcp, 9300/tcp, 0.0.0.0:8585-8586->8585-8586/tcp openmetadata_server
+63578aacbff5 openmetadata/ingestion:1.6.3 "./ingestion_depende…" 45 seconds ago Up 43 seconds 0.0.0.0:8080->8080/tcp openmetadata_ingestion
9f5ee8334f4b docker.elastic.co/elasticsearch/elasticsearch:7.16.3 "/tini -- /usr/local…" 45 seconds ago Up 44 seconds 0.0.0.0:9200->9200/tcp, 0.0.0.0:9300->9300/tcp openmetadata_elasticsearch
-08947ab3424b openmetadata/db:1.6.1 "/entrypoint.sh mysq…" 45 seconds ago Up 44 seconds (healthy) 3306/tcp, 33060-33061/tcp openmetadata_mysql
+08947ab3424b openmetadata/db:1.6.3 "/entrypoint.sh mysq…" 45 seconds ago Up 44 seconds (healthy) 3306/tcp, 33060-33061/tcp openmetadata_mysql
```
In a few seconds, you should be able to access the OpenMetadata UI at [http://localhost:8585](http://localhost:8585)
diff --git a/openmetadata-docs/content/v1.6.x/releases/releases/index.md b/openmetadata-docs/content/v1.6.x/releases/releases/index.md
index 056ea001a84f..71d7576eaef3 100644
--- a/openmetadata-docs/content/v1.6.x/releases/releases/index.md
+++ b/openmetadata-docs/content/v1.6.x/releases/releases/index.md
@@ -8,7 +8,7 @@ slug: /releases/all-releases
{% note %}
The OpenMetadata community is on a monthly release cadence. At every 4-5 weeks we will be releasing a new
-version. To see what's coming in next releases, please check our [Roadmap](/releases/roadmap) section.
+version. To see what's coming in next releases, please check our {% collateContent %}[Roadmap](https://www.getcollate.io/roadmap){% /collateContent %}{% ossContent %}[Roadmap](/roadmap){% /ossContent %} section.
{% /note %}
@@ -168,7 +168,7 @@ ER diagrams help you better understand and manage your data architecture by show
Organizations often struggle with data governance due to rigid, pre-defined manual workflows. OpenMetadata 1.6 introduces a new, automated data governance framework designed to be customized to each organization's needs.
-In Collate 1.6, the Glossary Approval Workflow has been migrated to this new framework. Now, you can create custom approval processes with specific conditions and rules and easily visualize them through intuitive workflow diagrams. You can also create smart approval processes for glossary terms with real-time state changes and task creation to save time and streamline work.
+In Collate 1.6, the Glossary Approval Workflow has been migrated to this new framework. Now, you can create custom approval processes with specific conditions and rules and easily visualize them through intuitive workflow diagrams. You can also create smart approval processes for glossary terms with real-time state changes and task creation to save time and streamline work.
## Data Certification Workflows for Automated Bronze, Silver, & Gold Data Standardization! (Collate)
@@ -202,7 +202,7 @@ OpenMetadata 1.6 extends Role-Based Access Control (RBAC) to search functionalit
## Expanded Connector Ecosystem and Diversity
-OpenMetadata's ingestion framework contains 80+ native connectors. These connectors are the foundation of the platform and bring in all the metadata your team needs: technical metadata, lineage, usage, profiling, etc.
+OpenMetadata's ingestion framework contains 90+ native connectors. These connectors are the foundation of the platform and bring in all the metadata your team needs: technical metadata, lineage, usage, profiling, etc.
We bring new connectors in each release, continuously expanding our coverage. This time, release 1.6 comes with seven new connectors:
@@ -770,7 +770,7 @@ To continue pursuing this objective, the application was completely refactored t
## Ingestion Connectors
-80+ connectors to help teams to centralize metadata. We continue to push the boundaries of this mission, in
+90+ connectors to help teams to centralize metadata. We continue to push the boundaries of this mission, in
- **Apache Flink** as a Pipeline Connector
- **SAP ERP**, after a long and successful collaboration with our community and SAP experts
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/collate-menu.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/collate-menu.md
index bfc76e4c23ab..55adbb4271b5 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/collate-menu.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/collate-menu.md
@@ -224,6 +224,10 @@ site_menu:
url: /connectors/database/s3-datalake/yaml
- category: Connectors / Database / S3 Datalake / Troubleshooting
url: /connectors/database/s3-datalake/troubleshooting
+ - category: Connectors / Database / Teradata
+ url: /connectors/database/teradata
+ - category: Connectors / Database / Teradata / Run Externally
+ url: /connectors/database/teradata/yaml
- category: Connectors / Database / Trino
url: /connectors/database/trino
- category: Connectors / Database / Trino / Run Externally
@@ -634,6 +638,8 @@ site_menu:
url: /how-to-guides/data-discovery/details
- category: How-to Guides / Data Discovery / Add Complex Queries using Advanced Search
url: /how-to-guides/data-discovery/advanced
+ - category: How-to Guides / Data Discovery / Troubleshooting
+ url: /how-to-guides/data-discovery/troubleshooting
- category: How-to Guides / Data Discovery / Bulk Upload Data Assets
url: /how-to-guides/data-discovery/bulk-upload
- category: How-to Guides / Data Discovery / How to Bulk Import Data Asset
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/db2/index.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/db2/index.md
index d489b105ae95..4dd22e7392e7 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/db2/index.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/db2/index.md
@@ -98,6 +98,20 @@ If you are using DB2 for IBM i:
- In Host and Port you should not add the Port Number.
{% /note %}
+{% note %}
+If you have a **db2jcc_license_cisuz.jar** file, it will not work with **ibm_db**. This file is a **Db2 Connect** license for the Java Driver.
+For **non-Java drivers**, such as the Python Client used in OpenMetadata ingestion, a **Db2 Connect** client-side license is required, typically named **db2con*.lic**.
+
+The **db2jcc_license_cisuz.jar** is specifically for Java-based clients, whereas OpenMetadata ingestion operates with a Python Client, making the `.jar` file incompatible.
+
+For activating a **non-Java license** for Db2 Connect **Application Server Edition**, **Advanced Application Server Edition**, **Enterprise Edition**, or **Trial**, follow these steps:
+- Download the **license activation kit** from IBM Passport Advantage: [IBM PPA](https://www.ibm.com/software/passportadvantage/pao_customer.html).
+- Unzip the package and locate the **non-Java license file** (e.g., `db2consv_ee.lic`).
+- Apply the `.lic` file to activate the license.
+
+For further reference, check this IBM post: [Everything About Db2 Connect Licensing](https://community.ibm.com/community/user/datamanagement/blogs/shilu-mathai2/2023/05/05/everything-about-db2-connect-licensing).
+{% /note %}
+
{% partial file="/v1.7/connectors/database/advanced-configuration.md" /%}
{% /extraContent %}
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/mongodb/yaml.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/mongodb/yaml.md
index 2fc2e94fc6cb..34b900f559dd 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/mongodb/yaml.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/mongodb/yaml.md
@@ -351,7 +351,7 @@ Here we are also importing all the basic requirements to parse YAMLs, handle dat
import yaml
from datetime import timedelta
from airflow import DAG
-from metadata.profiler.api.workflow import ProfilerWorkflow
+from metadata.workflow.profiler import ProfilerWorkflow
try:
from airflow.operators.python import PythonOperator
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/mysql/yaml.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/mysql/yaml.md
index 756a5f2b534f..87cc7afda4f0 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/mysql/yaml.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/mysql/yaml.md
@@ -191,8 +191,8 @@ For a simple, local installation using our docker containers, this looks like:
```yaml {% srNumber=40 %}
source:
- type: mssql-lineage
- serviceName: local_mssql
+ type: mysql-lineage
+ serviceName: local_mysql
sourceConfig:
config:
type: DatabaseLineage
@@ -243,12 +243,6 @@ source:
# - table4
```
-```yaml {% srNumber=49 %}
-sink:
- type: metadata-rest
- config: {}
-```
-
```yaml {% srNumber=51 %}
overrideViewLineage: false
```
@@ -269,6 +263,12 @@ sink:
threads: 1
```
+```yaml {% srNumber=49 %}
+sink:
+ type: metadata-rest
+ config: {}
+```
+
{% partial file="/v1.6/connectors/yaml/workflow-config.md" /%}
{% /codeBlock %}
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/snowflake/index.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/snowflake/index.md
index 70bb0f68f8d6..d0649194a9c6 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/snowflake/index.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/snowflake/index.md
@@ -115,6 +115,9 @@ You can find more information about the `account_usage` schema [here](https://do
- **Include Temporary and Transient Tables**:
Optional configuration for ingestion of `TRANSIENT` and `TEMPORARY` tables, By default, it will skip the `TRANSIENT` and `TEMPORARY` tables.
- **Client Session Keep Alive**: Optional Configuration to keep the session active in case the ingestion job runs for longer duration.
+- **Account Usage Schema Name**: Full name of the account usage schema, used in case your user does not have direct access to the `SNOWFLAKE.ACCOUNT_USAGE` schema. In such cases you can replicate the tables `QUERY_HISTORY`, `TAG_REFERENCES`, `PROCEDURES`, and `FUNCTIONS` to a custom schema, say `CUSTOM_DB.CUSTOM_SCHEMA`, and provide that name in this field.
+
+When using this field, make sure all of these tables (`QUERY_HISTORY`, `TAG_REFERENCES`, `PROCEDURES`, `FUNCTIONS`) are available within your custom schema.
{% partial file="/v1.7/connectors/database/advanced-configuration.md" /%}
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/snowflake/yaml.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/snowflake/yaml.md
index e46e4d4f2c80..9ca018518eeb 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/snowflake/yaml.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/database/snowflake/yaml.md
@@ -150,6 +150,14 @@ This is a sample config for Snowflake:
{% /codeInfo %}
+{% codeInfo srNumber=40 %}
+
+**accountUsageSchema**: Full name of the account usage schema, used in case your user does not have direct access to the `SNOWFLAKE.ACCOUNT_USAGE` schema. In such cases you can replicate the tables `QUERY_HISTORY`, `TAG_REFERENCES`, `PROCEDURES`, and `FUNCTIONS` to a custom schema, say `CUSTOM_DB.CUSTOM_SCHEMA`, and provide that name in this field.
+
+When using this field, make sure all of these tables (`QUERY_HISTORY`, `TAG_REFERENCES`, `PROCEDURES`, `FUNCTIONS`) are available within your custom schema.
+
+{% /codeInfo %}
+
{% codeInfo srNumber=6 %}
**includeTransientTables**: Optional configuration for ingestion of TRANSIENT and TEMPORARY tables, By default, it will skip the TRANSIENT and TEMPORARY tables.
@@ -231,6 +239,9 @@ source:
```yaml {% srNumber=5 %}
# database:
```
+```yaml {% srNumber=40 %}
+ # accountUsageSchema: SNOWFLAKE.ACCOUNT_USAGE
+```
```yaml {% srNumber=6 %}
includeTransientTables: false
```
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/pipeline/matillion/index.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/pipeline/matillion/index.md
index 824d1d28c601..f1c0de671585 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/pipeline/matillion/index.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/connectors/pipeline/matillion/index.md
@@ -30,6 +30,8 @@ Configure and schedule Matillion metadata and profiler workflows from the OpenMe
To extract metadata from Matillion, you need to create a user with the following permissions:
- `API` Permission ( While Creating the User, from Admin -> User )
+- To retrieve lineage data, the user must be granted [Component-level permissions](https://docs.matillion.com/metl/docs/2932106/#component).
+- To enable lineage tracking in Matillion, **Matillion Enterprise Mode** is required. For detailed setup instructions and further information, refer to the official documentation: [Matillion Lineage Documentation](https://docs.matillion.com/metl/docs/2881895/).
### Matillion Versions
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/deployment/oss-security.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/deployment/oss-security.md
new file mode 100644
index 000000000000..385e5f2fb8ee
--- /dev/null
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/deployment/oss-security.md
@@ -0,0 +1,44 @@
+---
+title: OSS Security Best Practices
+slug: /deployment/oss-security
+collate: false
+---
+
+# OSS Security
+
+## Encryption of Connection Credentials
+
+OpenMetadata ensures that sensitive information, such as passwords and connection secrets, is securely stored.
+
+- **Encryption Algorithm**: OpenMetadata uses **Fernet encryption** to encrypt secrets and passwords before storing them in the database.
+- **Fernet Encryption Details**:
+ - Uses **AES-128 in CBC mode** with a strong key-based approach.
+  - **Not based on hashing or salting**, but rather an encryption/decryption method with a symmetric key (see the illustrative sketch after this list).
+- **Secrets Manager Support**:
+ - Users can **avoid storing credentials** in OpenMetadata by configuring an external **Secrets Manager**.
+ - More details on setting up a Secrets Manager can be found here:
+ 🔗 [Secrets Manager Documentation](https://docs.open-metadata.org/latest/deployment/secrets-manager)
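+
+The snippet below is purely illustrative: it uses the third-party `cryptography` package (`pip install cryptography`) to show what Fernet symmetric encryption and decryption look like. It does not reproduce how OpenMetadata generates or stores its own key.
+
+```python
+from cryptography.fernet import Fernet
+
+key = Fernet.generate_key()          # symmetric key; only holders of the key can decrypt
+fernet = Fernet(key)
+
+token = fernet.encrypt(b"my-connection-password")  # ciphertext, safe to persist
+restored = fernet.decrypt(token)                   # recoverable only with the key
+
+assert restored == b"my-connection-password"
+```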
+
+## Secure Connections to Data Sources
+
+OpenMetadata supports **encrypted connections** to various databases and services.
+
+- **SSL/TLS Support**:
+ - OpenMetadata allows users to configure **SSL/TLS encryption** for secure data transmission.
+ - Users can specify **SSL modes** and provide **CA certificates** for SSL validation.
+- **How to Enable SSL?**
+ - Each connector supports different SSL configurations.
+ - Follow the detailed guide for enabling SSL in OpenMetadata:
+ 🔗 [Enable SSL in OpenMetadata](https://docs.open-metadata.org/latest/deployment/security/enable-ssl)
+
+## Additional Security Measures
+
+- **Role-Based Access Control (RBAC)**: OpenMetadata allows administrators to define user roles and permissions.
+- **Authentication & Authorization**: OpenMetadata supports integration with OAuth, SAML, and LDAP for secure authentication.
+- **Data Access Control**: Users can restrict access to metadata based on policies and governance rules.
+
+{% note %}
+- **Passwords and secrets are securely encrypted** using **Fernet encryption**.
+- **Connections to data sources can be encrypted** using **SSL/TLS**.
+- **Secrets Managers** can be used to manage credentials externally.
+{% /note %}
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/getting-started/day-1/index.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/getting-started/day-1/index.md
index af90c55cbc86..a92ab62dbbba 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/getting-started/day-1/index.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/getting-started/day-1/index.md
@@ -8,10 +8,10 @@ collate: true
Get started with your Collate service in just few simple steps:
-1. Set up a Data Connector: Connect your data sources to begin collecting metadata.
-2. Ingest Metadata: Run the metadata ingestion to gather and push data insights.
-3. Invite Users: Add team members to collaborate and manage metadata together.
-4. Explore the Features: Dive into Collate's rich feature set to unlock the full potential of your data.
+1. **Set up a Data Connector**: Connect your data sources to begin collecting metadata.
+2. **Ingest Metadata**: Run the metadata ingestion to gather and push data insights.
+3. **Invite Users**: Add team members to collaborate and manage metadata together.
+4. **Explore the Features**: Dive into Collate's rich feature set to unlock the full potential of your data.
**Ready to begin? Let's get started!**
@@ -24,7 +24,7 @@ with links to more detailed documentation.
## Step 1: Set up a Data Connector
Once you’re able to login to your Collate instance, set up a data connector to start bringing metadata into Collate.
-There are [80+ turnkey connectors](/connectors) to various services: data warehouses, data lakes, databases, dashboards,
+There are [90+ turnkey connectors](/connectors) to various services: data warehouses, data lakes, databases, dashboards,
messaging services, pipelines, ML models, storage services, and other Metadata Services.
Connections to [custom data sources](/connectors/custom-connectors) can also be created via API.
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/getting-started/index.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/getting-started/index.md
index 4de859fbebf7..b3a0c941ecb6 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/getting-started/index.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/getting-started/index.md
@@ -7,7 +7,7 @@ collate: true
# Getting Started
Welcome to Collate's unified platform for data discovery, observability, and governance! Our platform centralizes all
-the context around your data to help you build high-quality and AI assets. This guide gives you all the information you
+the context around your data to help you build high-quality data and AI assets. This guide gives you all the information you
need to set up your Collate environment in 30 minutes.
## How Does Collate Work?
@@ -15,14 +15,14 @@ need to set up your Collate environment in 30 minutes.
Collate is designed for both technical and non-technical data practitioners to work together across a broad set of use cases,
including data discovery, lineage, observability, quality, collaboration, governance, and insights.
-A library of 80+ turnkey connectors is available to easily ingest metadata into Collate, such as data warehouses, data lakes,
+A library of 90+ turnkey connectors is available to easily ingest metadata into Collate, such as data warehouses, data lakes,
streaming, dashboards, ML models, and more. APIs are also available to easily ingest metadata from custom data sources.
Metadata from these different sources is organized into a Unified Metadata Graph, which provides a single, comprehensive
source of truth across your entire data estate.
This centralized information is surfaced through a unified user interface for all your use cases so that different data
practitioners no longer need to switch between different data catalogs, quality, or governance tools. Additionally,
-Collate can be extended through the application ecosystem, such as with AI productivity applications like MetaPilot,
+Collate can be extended through the application ecosystem, such as with AI productivity applications like Collate AI,
or with customer-built workflows to integrate Collate with your existing systems. These capabilities are built around
native collaboration capabilities for shared workflows across different teams so that every data practitioner can work
together: data platform, data governance, data scientist/analyst, and business user.
@@ -31,7 +31,7 @@ together: data platform, data governance, data scientist/analyst, and business u
Before we get started, here’s a quick summary of some of Collate’s main features:
-- **Discovery**: integrated catalog, quality, and glossary; natural language search, filtering, and faceting, 80+ turnkey data connectors, and MetaPilot AI Chatbot.
+- **Discovery**: integrated catalog, quality, and glossary; natural language search, filtering, and faceting, 90+ turnkey data connectors, and Collate AI Chatbot.
- **Lineage**: table and column-level lineage, automated data estate mapping and APIs, lineage layers and search, governance and PII automation and manual customization.
- **Observability**: alerting and notifications, incident management, third-party notifications, pipeline monitoring, root cause analysis, anomaly detection, data profiler.
- **Quality**: table and column test cases, no-code and SQL data quality tests, test suites, test case reporting, quality dashboards, widgets and data quality lineage maps.
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/admin-guide/roles-policies/authorization.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/admin-guide/roles-policies/authorization.md
index 936756f1912f..15133572cda0 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/admin-guide/roles-policies/authorization.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/admin-guide/roles-policies/authorization.md
@@ -34,6 +34,7 @@ Here are some examples of conditions.
| **matchAllTags(tagFqn, [tagFqn…])** | Returns true if the resource has all the tags from the tag list. |
| **matchAnyTag(tagFqn, [tagFqn…])** | Returns true if the resource has any of the tags from the tag list. |
| **matchTeam()** | Returns true if the user belongs to the team that owns the resource. |
+| **hasDomain()** | Returns true if the logged-in user has domain access to the entity being accessed. |
Conditions are used to assess DataAsset like Tables/Topics/Dashboards etc.. for specific attributes.
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/data-discovery/troubleshooting.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/data-discovery/troubleshooting.md
new file mode 100644
index 000000000000..5122d4e01d32
--- /dev/null
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/data-discovery/troubleshooting.md
@@ -0,0 +1,42 @@
+---
+title: Troubleshooting for Export issue
+slug: /how-to-guides/data-discovery/troubleshooting
+---
+
+# Troubleshooting Export Issue
+When attempting to export a **CSV file for a Glossary**, the process gets stuck on the message **"Export initiated successfully."** and never completes. The file is not downloaded, and the export button remains disabled.
+
+This issue may occur if **WebSockets are blocked** in your network setup due to a **proxy** or **load balancer** configuration. OpenMetadata relies on WebSockets for real-time communication, and if they are blocked, the export process cannot complete.
+
+## Troubleshooting Steps
+
+### Step 1: Check for Load Balancer or Proxy
+
+If your setup includes a **load balancer** or **proxy**, verify whether WebSockets are being blocked.
+
+1. Run the following API request to check the export status:
+
+```bash
+# Replace the placeholders with your OpenMetadata host and the glossary name.
+curl -X GET "https://<your-openmetadata-host>/api/v1/glossaries/name/<glossary-name>/exportAsync"
+```
+
+If the response does not return a file and remains in an active state indefinitely, WebSockets might be blocked.
+
+### Step 2: Verify WebSocket Connectivity
+
+1. Open the Developer Tools in your browser (F12 or Ctrl + Shift + I in Chrome).
+2. Navigate to the Network tab.
+3. Filter requests by WebSockets (WS).
+4. Check if WebSocket requests to OpenMetadata (wss://) are blocked, failing, or not established.
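+
+If you prefer to verify this outside the browser, below is a minimal, illustrative probe written in Python using the third-party `websockets` package (`pip install websockets`). The `wss://` URL is a placeholder; replace it with the exact WebSocket URL you see failing in the Network tab.
+
+```python
+import asyncio
+
+import websockets  # third-party package: pip install websockets
+
+# Placeholder endpoint: substitute the wss:// URL observed in the browser.
+WSS_URL = "wss://<your-openmetadata-host>/socket.io/?EIO=4&transport=websocket"
+
+async def probe() -> None:
+    try:
+        connection = await asyncio.wait_for(websockets.connect(WSS_URL), timeout=10)
+        print("WebSocket handshake succeeded")
+        await connection.close()
+    except Exception as exc:  # blocked by a proxy, rejected, or timed out
+        print(f"WebSocket connection failed: {exc!r}")
+
+asyncio.run(probe())
+```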
+
+### Step 3: Adjust WebSocket Settings in Your Proxy
+
+If WebSockets are blocked, update your proxy configuration to allow WebSocket traffic.
+
+### Step 4: Restart Services and Verify
+
+1. Restart your proxy or load balancer after making the configuration changes.
+2. Clear browser cache and cookies.
+3. Retry the CSV export in OpenMetadata.
+
+Once WebSockets are enabled in the proxy settings, the glossary export should complete successfully, and the CSV file should be available for download.
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/data-governance/automation/index.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/data-governance/automation/index.md
index fa90865335a0..85681472541f 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/data-governance/automation/index.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/data-governance/automation/index.md
@@ -23,10 +23,26 @@ Managing metadata manually can be challenging, particularly in dynamic environme
## Key Use Cases for Collate Automations
-### 1. Bulk Ownership and Domain Assignment
+### 1. Bulk Description
{% image
-src="/images/v1.7/how-to-guides/governance/bulk-ownership-and.png"
+src="/images/v1.6/how-to-guides/governance/automator-description.png"
+alt="Getting started with Automation"
+caption="Getting started with Automation"
+/%}
+
+- **Problem**: Many datasets lack descriptions, making it difficult for users to understand the data's purpose and contents. Sometimes, the same column description needs to be added to multiple datasets.
+- **Solution**: Automations can bulk-apply descriptions to tables and columns, ensuring that all data assets are consistently documented.
+- **Benefit**: This use case improves data discoverability and understanding, making it easier for users to find and use the data effectively.
+
+For the Action Configuration:
+- **Apply to Children**: Lets you apply the description to the selected child assets (e.g., columns) within an asset.
+- **Overwrite Metadata**: Allows you to overwrite existing descriptions with the new description. Otherwise, we will only apply the description to tables or columns that do not already have one.
+
+### 2. Bulk Ownership and Domain Assignment
+
+{% image
+src="/images/v1.6/how-to-guides/governance/bulk-ownership-and.png"
alt="Getting started with Automation"
caption="Getting started with Automation"
/%}
@@ -35,10 +51,13 @@ caption="Getting started with Automation"
- **Solution**: Automations can bulk-assign ownership and domains to datasets, ensuring all data assets are correctly categorized and owned. This process can be applied to tables, schemas, or other assets within Collate.
- **Benefit**: This use case ensures data assets have a designated owner and are organized under the appropriate domain, making data more discoverable and accountable.
-### 2. Bulk Tagging and Glossary Term Assignment
+For the Action Configuration:
+- **Overwrite Metadata**: Allows you to overwrite existing owner or domain with the configured one. Otherwise, we will only apply the owner or domain to assets that do not have an existing owner or domain.
+
+### 3. Bulk Tagging and Glossary Term Assignment
{% image
-src="/images/v1.7/how-to-guides/governance/bulk-tagging-glossary.png"
+src="/images/v1.6/how-to-guides/governance/bulk-tagging-glossary.png"
alt="Getting started with Automation"
caption="Getting started with Automation"
/%}
@@ -47,10 +66,15 @@ caption="Getting started with Automation"
- **Solution**: Automations allow users to bulk-apply tags (e.g., PII) or glossary terms (e.g., Customer ID) to specific datasets, ensuring uniformity across the platform.
- **Benefit**: This automation reduces the risk of missing important tags like PII-sensitive and ensures that key metadata elements are applied consistently across datasets.
-### 3. Metadata Propagation via Lineage
+For the Action Configuration:
+- **Apply to Children**: Lets you apply the Tags or Glossary Terms to the selected child assets (e.g., columns) within an asset.
+- **Overwrite Metadata**: Allows you to overwrite existing Tags or Terms with the configured ones. Otherwise, we will add the new Tags or Terms to the existing ones.
+
+
+### 4. Metadata Propagation via Lineage
{% image
-src="/images/v1.7/how-to-guides/governance/metadata-propogation.png"
+src="/images/v1.6/how-to-guides/governance/metadata-propogation.png"
alt="Getting started with Automation"
caption="Getting started with Automation"
/%}
@@ -59,14 +83,35 @@ caption="Getting started with Automation"
- **Solution**: Use automations to propagate metadata across related datasets, ensuring that all relevant data inherits the correct metadata properties from the source dataset.
- **Benefit**: Metadata consistency is ensured across the entire data lineage, reducing the need for manual updates and maintaining a single source of truth.
-### 4. Automatic PII Detection and Tagging
+For the Action Configuration:
+1. First, we can choose if we want the propagation to happen at the Parent level (e.g., Table), Column Level, or both. This can be configured by selecting **Propagate Parent** and/or **Propagate Column Level**.
+2. Then, we can control which pieces of metadata we want to propagate via lineage:
+ - **Propagate Description**: Propagates the description from the source asset to the downstream assets. Works for both parent and column-level.
+ - **Propagate Tags**: Propagates the tags from the source asset to the downstream assets. Works for both parent and column-level.
+ - **Propagate Glossary Terms**: Propagates the glossary terms from the source asset to the downstream assets. Works for both parent and column-level.
+ - **Propagate Owners**: Only applicable for Parent assets. Propagates the owner information to downstream assets.
+  - **Propagate Tier**: Only applicable for Parent assets. Propagates the tier information to downstream assets.
+
+As with other actions, you can choose to **Overwrite Metadata** or keep the existing metadata and only apply the new metadata to assets that do not have the metadata already.
+
+
+### 5. Automatic PII Detection and Tagging
{% image
-src="/images/v1.7/how-to-guides/governance/automatic-detection.png"
+src="/images/v1.6/how-to-guides/governance/automatic-detection.png"
alt="Getting started with Automation"
caption="Getting started with Automation"
/%}
+{% note noteType="Warning" %}
+
+We recommend using the **Auto Classification** workflow instead, which allows you to discover PII data automatically,
+even in cases where you don't want to ingest the Sample Data into Collate.
+
+Note that this automation (ML Tagging) will be deprecated in future releases.
+
+{% /note %}
+
- **Problem**: Manually identifying and tagging Personally Identifiable Information (PII) across large datasets is labor-intensive and prone to errors.
- **Solution**: Automations can automatically detect PII data (e.g., emails, usernames) and apply relevant tags to ensure that sensitive data is flagged appropriately for compliance.
- **Benefit**: Ensures compliance with data protection regulations by consistently tagging sensitive data, reducing the risk of non-compliance.
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/data-governance/classification/Auto Classification/auto-pii-tagging.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/data-governance/classification/Auto Classification/auto-pii-tagging.md
index 9bf30da68f38..c691a6062151 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/data-governance/classification/Auto Classification/auto-pii-tagging.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/data-governance/classification/Auto Classification/auto-pii-tagging.md
@@ -7,11 +7,6 @@ slug: /how-to-guides/data-governance/classification/auto/auto-pii-tagging
Auto PII tagging for Sensitive/NonSensitive at the column level is performed based on the two approaches described below.
-{% note %}
-PII Tagging is only available during `Profiler Ingestion`.
-{% /note %}
-
-
## Tagging logic
1. **Column Name Scanner**: We validate the column names of the table against a set of regex rules that help us identify
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/data-governance/classification/Auto Classification/external-workflow.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/data-governance/classification/Auto Classification/external-workflow.md
index 2d82baac5abe..7c57f81bf8d8 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/data-governance/classification/Auto Classification/external-workflow.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/how-to-guides/data-governance/classification/Auto Classification/external-workflow.md
@@ -42,6 +42,19 @@ The Auto Classification Workflow enables automatic tagging of sensitive informat
- When set to `true`, filtering patterns will be applied to the Fully Qualified Name of a table (e.g., `service_name.db_name.schema_name.table_name`).
- When set to `false`, filtering applies only to raw table names.
+## Auto Classification Workflow Execution
+
+To execute the **Auto Classification Workflow**, follow the steps below:
+
+### 1. Install the Required Python Package
+Ensure you have the correct OpenMetadata ingestion package installed, including the **PII Processor** module:
+
+```bash
+pip install "openmetadata-ingestion[pii-processor]"
+```
+
+### 2. Define and Execute the Python Workflow
+Instead of using a YAML configuration, use the `AutoClassificationWorkflow` from the OpenMetadata ingestion framework to trigger the process programmatically, as sketched below.
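+
+The snippet below is a minimal sketch rather than the verbatim API: it assumes the class is importable as `metadata.workflow.classification.AutoClassificationWorkflow` (the exact import path may vary across `openmetadata-ingestion` versions) and that it follows the same `create` / `execute` / `raise_from_status` / `print_status` / `stop` pattern used by the other ingestion workflows in these docs, such as `ProfilerWorkflow`. The file name `auto_classification.yaml` is a placeholder for the sample YAML shown in the next section.
+
+```python
+import yaml
+
+# Assumed import path; adjust it to your installed openmetadata-ingestion version.
+from metadata.workflow.classification import AutoClassificationWorkflow
+
+# Placeholder file holding the sample Auto Classification YAML from the section below.
+with open("auto_classification.yaml") as config_file:
+    workflow_config = yaml.safe_load(config_file)
+
+workflow = AutoClassificationWorkflow.create(workflow_config)
+workflow.execute()
+workflow.raise_from_status()  # fail fast if any step reported errors
+workflow.print_status()
+workflow.stop()
+```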
+
## Sample Auto Classification Workflow yaml
```yaml
@@ -103,6 +116,14 @@ workflowConfig:
jwtToken: "eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg"
```
+### 3. Expected Outcome
+- Automatically classifies and tags sensitive data based on predefined patterns and confidence levels.
+- Improves metadata enrichment and enhances data governance practices.
+- Provides visibility into sensitive data across databases.
+
+This approach ensures that the Auto Classification Workflow is executed correctly using the appropriate OpenMetadata ingestion framework.
+
+{% partial file="/v1.7/connectors/yaml/auto-classification.md" variables={connector: "snowflake"} /%}
+
## Workflow Execution
### To Execute the Auto Classification Workflow:
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/menu.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/menu.md
index f5833c56a26a..93f5de33997c 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/menu.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/menu.md
@@ -222,6 +222,9 @@ site_menu:
- category: Deployment / Metrics
url: /deployment/metrics
+
+ - category: Deployment / OSS Security
+ url: /deployment/oss-security
- category: Connectors
url: /connectors
@@ -807,6 +810,8 @@ site_menu:
url: /how-to-guides/data-discovery/details
- category: How-to Guides / Data Discovery / Add Complex Queries using Advanced Search
url: /how-to-guides/data-discovery/advanced
+ - category: How-to Guides / Data Discovery / Troubleshooting
+ url: /how-to-guides/data-discovery/troubleshooting
- category: How-to Guides / Data Discovery / OpenMetadata Chrome Extension
url: /how-to-guides/data-discovery/openmetadata-extension
- category: How-to Guides / Data Discovery / OpenMetadata Chrome Extension / Viewing Activity Feeds and Managing Tasks
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/quick-start/getting-started/day-1/index.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/quick-start/getting-started/day-1/index.md
index 831688efda9b..dca07d407fdd 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/quick-start/getting-started/day-1/index.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/quick-start/getting-started/day-1/index.md
@@ -7,10 +7,10 @@ slug: /quick-start/getting-started/day-1
Get started with your OpenMetadata service in a few simple steps:
-1. Set up a Data Connector: Connect your data sources to begin collecting metadata.
-2. Ingest Metadata: Run the metadata ingestion process to gather and push data insights.
-3. Invite Users: Add team members to collaborate and manage metadata together.
-4. Explore the Features: Dive into OpenMetadata's extensive feature set to unlock the full potential of your data.
+1. **Set up a Data Connector**: Connect your data sources to begin collecting metadata.
+2. **Ingest Metadata**: Run the metadata ingestion process to gather and push data insights.
+3. **Invite Users**: Add team members to collaborate and manage metadata together.
+4. **Explore the Features**: Dive into OpenMetadata's extensive feature set to unlock the full potential of your data.
**Ready to begin? Let's get started!**
@@ -20,7 +20,7 @@ You should receive your initial OpenMetadata credentials from OpenMetadata suppo
## Step 1: Set up a Data Connector
-Once you have logged into your OpenMetadata instance, set up a data connector to start ingesting metadata. OpenMetadata provides [80+ turnkey connectors](/connectors) for a wide range of services, including:
+Once you have logged into your OpenMetadata instance, set up a data connector to start ingesting metadata. OpenMetadata provides [90+ turnkey connectors](/connectors) for a wide range of services, including:
- Databases
- Dashboards
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/quick-start/getting-started/index.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/quick-start/getting-started/index.md
index 778794a85fd3..7130d1987f6a 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/quick-start/getting-started/index.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/quick-start/getting-started/index.md
@@ -11,9 +11,9 @@ Welcome to OpenMetadata's unified platform for data discovery, observability, an
OpenMetadata is designed to support both technical and non-technical data practitioners across various use cases, including data discovery, lineage, observability, quality, collaboration, governance, and insights.
-The platform includes a library of 80+ turnkey connectors to easily ingest metadata from sources such as data warehouses, data lakes, streaming platforms, dashboards, and ML models. For custom data sources, APIs are available to streamline metadata ingestion. Metadata from these sources is organized into a Unified Metadata Graph, providing a single, comprehensive source of truth for your entire data estate.
+The platform includes a library of 90+ turnkey connectors to easily ingest metadata from sources such as data warehouses, data lakes, streaming platforms, dashboards, and ML models. For custom data sources, APIs are available to streamline metadata ingestion. Metadata from these sources is organized into a Unified Metadata Graph, providing a single, comprehensive source of truth for your entire data estate.
-This centralized metadata is accessible through a unified user interface, eliminating the need for practitioners to switch between multiple catalogs, quality, or governance tools. OpenMetadata can also be extended with applications, such as AI-driven productivity tools like MetaPilot, or through custom-built workflows that integrate the platform with existing systems.
+This centralized metadata is accessible through a unified user interface, eliminating the need for practitioners to switch between multiple catalogs, quality, or governance tools. OpenMetadata can also be extended with applications, such as AI-driven productivity tools like Collate AI, or through custom-built workflows that integrate the platform with existing systems.
The platform’s native collaboration features support shared workflows, enabling different teams—data platform engineers, governance professionals, data scientists/analysts, and business users—to collaborate effectively in a single environment.
## Key Features of OpenMetadata
@@ -23,7 +23,7 @@ Before we get started, here’s a quick summary of some of OpenMetadata’s main
### Discovery
- Integrated catalog, data quality, and glossary
- Natural language search, filtering, and faceting
-- 80+ turnkey data connectors
+- 90+ turnkey data connectors
### Lineage
- Table and column-level lineage
diff --git a/openmetadata-docs/content/v1.7.x-SNAPSHOT/releases/releases/index.md b/openmetadata-docs/content/v1.7.x-SNAPSHOT/releases/releases/index.md
index 5636e4c0cd67..b69b062859da 100644
--- a/openmetadata-docs/content/v1.7.x-SNAPSHOT/releases/releases/index.md
+++ b/openmetadata-docs/content/v1.7.x-SNAPSHOT/releases/releases/index.md
@@ -8,7 +8,7 @@ slug: /releases/all-releases
{% note %}
The OpenMetadata community is on a monthly release cadence. At every 4-5 weeks we will be releasing a new
-version. To see what's coming in next releases, please check our [Roadmap](/releases/roadmap) section.
+version. To see what's coming in next releases, please check our {% collateContent %}[Roadmap](https://www.getcollate.io/roadmap){% /collateContent %}{% ossContent %}[Roadmap](/roadmap){% /ossContent %} section.
{% /note %}
@@ -168,7 +168,7 @@ ER diagrams help you better understand and manage your data architecture by show
Organizations often struggle with data governance due to rigid, pre-defined manual workflows. OpenMetadata 1.6 introduces a new, automated data governance framework designed to be customized to each organization's needs.
-In Collate 1.6, the Glossary Approval Workflow has been migrated to this new framework. Now, you can create custom approval processes with specific conditions and rules and easily visualize them through intuitive workflow diagrams. You can also create smart approval processes for glossary terms with real-time state changes and task creation to save time and streamline work.
+In Collate 1.6, the Glossary Approval Workflow has been migrated to this new framework. Now, you can create custom approval processes with specific conditions and rules and easily visualize them through intuitive workflow diagrams. You can also create smart approval processes for glossary terms with real-time state changes and task creation to save time and streamline work.
## Data Certification Workflows for Automated Bronze, Silver, & Gold Data Standardization! (Collate)
@@ -202,7 +202,7 @@ OpenMetadata 1.6 extends Role-Based Access Control (RBAC) to search functionalit
## Expanded Connector Ecosystem and Diversity
-OpenMetadata's ingestion framework contains 80+ native connectors. These connectors are the foundation of the platform and bring in all the metadata your team needs: technical metadata, lineage, usage, profiling, etc.
+OpenMetadata's ingestion framework contains 90+ native connectors. These connectors are the foundation of the platform and bring in all the metadata your team needs: technical metadata, lineage, usage, profiling, etc.
We bring new connectors in each release, continuously expanding our coverage. This time, release 1.6 comes with seven new connectors:
@@ -770,7 +770,7 @@ To continue pursuing this objective, the application was completely refactored t
## Ingestion Connectors
-80+ connectors to help teams to centralize metadata. We continue to push the boundaries of this mission, in
+90+ connectors to help teams to centralize metadata. We continue to push the boundaries of this mission, in
- **Apache Flink** as a Pipeline Connector
- **SAP ERP**, after a long and successful collaboration with our community and SAP experts
diff --git a/openmetadata-docs/images/connectors/synapse.webp b/openmetadata-docs/images/connectors/synapse.webp
new file mode 100644
index 000000000000..36197c9efe3a
Binary files /dev/null and b/openmetadata-docs/images/connectors/synapse.webp differ
diff --git a/openmetadata-docs/images/v1.6/connectors/adls/add-new-service.png b/openmetadata-docs/images/v1.6/connectors/adls/add-new-service.png
new file mode 100644
index 000000000000..b9c8c2d7681a
Binary files /dev/null and b/openmetadata-docs/images/v1.6/connectors/adls/add-new-service.png differ
diff --git a/openmetadata-docs/images/v1.6/connectors/adls/select-service.png b/openmetadata-docs/images/v1.6/connectors/adls/select-service.png
new file mode 100644
index 000000000000..7f29fdc0818e
Binary files /dev/null and b/openmetadata-docs/images/v1.6/connectors/adls/select-service.png differ
diff --git a/openmetadata-docs/images/v1.6/connectors/adls/service-connection.png b/openmetadata-docs/images/v1.6/connectors/adls/service-connection.png
new file mode 100644
index 000000000000..9e175a777333
Binary files /dev/null and b/openmetadata-docs/images/v1.6/connectors/adls/service-connection.png differ
diff --git a/openmetadata-docs/images/v1.6/deployment/upgrade/running-state-in-openmetadata.png b/openmetadata-docs/images/v1.6/deployment/upgrade/running-state-in-openmetadata.png
new file mode 100644
index 000000000000..cd808c4d92a8
Binary files /dev/null and b/openmetadata-docs/images/v1.6/deployment/upgrade/running-state-in-openmetadata.png differ
diff --git a/openmetadata-docs/images/v1.6/how-to-guides/governance/automator-description.png b/openmetadata-docs/images/v1.6/how-to-guides/governance/automator-description.png
new file mode 100644
index 000000000000..edd397fc9521
Binary files /dev/null and b/openmetadata-docs/images/v1.6/how-to-guides/governance/automator-description.png differ
diff --git a/openmetadata-docs/images/v1.7/connectors/adls/add-new-service.png b/openmetadata-docs/images/v1.7/connectors/adls/add-new-service.png
new file mode 100644
index 000000000000..b9c8c2d7681a
Binary files /dev/null and b/openmetadata-docs/images/v1.7/connectors/adls/add-new-service.png differ
diff --git a/openmetadata-docs/images/v1.7/connectors/adls/select-service.png b/openmetadata-docs/images/v1.7/connectors/adls/select-service.png
new file mode 100644
index 000000000000..7f29fdc0818e
Binary files /dev/null and b/openmetadata-docs/images/v1.7/connectors/adls/select-service.png differ
diff --git a/openmetadata-docs/images/v1.7/connectors/adls/service-connection.png b/openmetadata-docs/images/v1.7/connectors/adls/service-connection.png
new file mode 100644
index 000000000000..9e175a777333
Binary files /dev/null and b/openmetadata-docs/images/v1.7/connectors/adls/service-connection.png differ
diff --git a/openmetadata-docs/images/v1.7/deployment/upgrade/running-state-in-openmetadata.png b/openmetadata-docs/images/v1.7/deployment/upgrade/running-state-in-openmetadata.png
new file mode 100644
index 000000000000..cd808c4d92a8
Binary files /dev/null and b/openmetadata-docs/images/v1.7/deployment/upgrade/running-state-in-openmetadata.png differ
diff --git a/openmetadata-docs/images/v1.7/how-to-guides/governance/automator-description.png b/openmetadata-docs/images/v1.7/how-to-guides/governance/automator-description.png
new file mode 100644
index 000000000000..edd397fc9521
Binary files /dev/null and b/openmetadata-docs/images/v1.7/how-to-guides/governance/automator-description.png differ
diff --git a/openmetadata-service/pom.xml b/openmetadata-service/pom.xml
index 11b04594f47a..64c6aefcc52c 100644
--- a/openmetadata-service/pom.xml
+++ b/openmetadata-service/pom.xml
@@ -16,7 +16,7 @@
${project.basedir}/target/site/jacoco-aggregate/jacoco.xml${project.basedir}/src/test/java1.20.3
- 2.29.15
+ 2.30.191.14.04.9.01.0.0
@@ -28,6 +28,7 @@
3.6.03.3.12.1.1
+ <json-smart.version>2.5.2</json-smart.version>
@@ -89,7 +90,7 @@
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
- <version>2.5.1</version>
+ <version>${json-smart.version}</version>
<groupId>org.open-metadata</groupId>
diff --git a/openmetadata-service/src/main/java/org/openmetadata/csv/EntityCsv.java b/openmetadata-service/src/main/java/org/openmetadata/csv/EntityCsv.java
index 0121dbdc6d07..f13ae7b53a5c 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/csv/EntityCsv.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/csv/EntityCsv.java
@@ -50,6 +50,7 @@
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
+import java.util.function.Function;
import javax.ws.rs.core.Response;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.csv.CSVFormat;
@@ -193,7 +194,11 @@ public final void addRecord(CsvFile csvFile, List<String> recordList) {
}
/** Owner field is in entityType:entityName format */
- public List<EntityReference> getOwners(CSVPrinter printer, CSVRecord csvRecord, int fieldNumber)
+ public List<EntityReference> getOwners(
+ CSVPrinter printer,
+ CSVRecord csvRecord,
+ int fieldNumber,
+ Function<Integer, String> invalidMessageCreator)
throws IOException {
if (!processRecord) {
return null;
@@ -207,7 +212,7 @@ public List<EntityReference> getOwners(CSVPrinter printer, CSVRecord csvRecord,
for (String owner : owners) {
List<String> ownerTypes = listOrEmpty(CsvUtil.fieldToEntities(owner));
if (ownerTypes.size() != 2) {
- importFailure(printer, invalidOwner(fieldNumber), csvRecord);
+ importFailure(printer, invalidMessageCreator.apply(fieldNumber), csvRecord);
return Collections.emptyList();
}
EntityReference ownerRef =
@@ -219,6 +224,16 @@ public List<EntityReference> getOwners(CSVPrinter printer, CSVRecord csvRecord,
return refs.isEmpty() ? null : refs;
}
+ public List<EntityReference> getOwners(CSVPrinter printer, CSVRecord csvRecord, int fieldNumber)
+ throws IOException {
+ return getOwners(printer, csvRecord, fieldNumber, EntityCsv::invalidOwner);
+ }
+
+ public List<EntityReference> getReviewers(
+ CSVPrinter printer, CSVRecord csvRecord, int fieldNumber) throws IOException {
+ return getOwners(printer, csvRecord, fieldNumber, EntityCsv::invalidReviewer);
+ }
+
/** Owner field is in entityName format */
public EntityReference getOwnerAsUser(CSVPrinter printer, CSVRecord csvRecord, int fieldNumber)
throws IOException {
@@ -868,6 +883,11 @@ public static String invalidOwner(int field) {
return String.format(FIELD_ERROR_MSG, CsvErrorType.INVALID_FIELD, field + 1, error);
}
+ public static String invalidReviewer(int field) {
+ String error = "Reviewer should be of format user:userName or team:teamName";
+ return String.format(FIELD_ERROR_MSG, CsvErrorType.INVALID_FIELD, field + 1, error);
+ }
+
public static String invalidExtension(int field, String key, String value) {
String error =
"Invalid key-value pair in extension string: Key = "
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/TypeRegistry.java b/openmetadata-service/src/main/java/org/openmetadata/service/TypeRegistry.java
index a294a79cc51b..78c26a5668c5 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/TypeRegistry.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/TypeRegistry.java
@@ -14,15 +14,22 @@
package org.openmetadata.service;
import static org.openmetadata.common.utils.CommonUtil.listOrEmpty;
+import static org.openmetadata.service.Entity.ADMIN_USER_NAME;
+import static org.openmetadata.service.resources.types.TypeResource.PROPERTIES_FIELD;
import com.networknt.schema.JsonSchema;
+import java.util.List;
import java.util.Map;
+import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import lombok.extern.slf4j.Slf4j;
import org.openmetadata.schema.entity.Type;
+import org.openmetadata.schema.entity.type.Category;
import org.openmetadata.schema.entity.type.CustomProperty;
import org.openmetadata.service.exception.CatalogExceptionMessage;
import org.openmetadata.service.exception.EntityNotFoundException;
+import org.openmetadata.service.jdbi3.TypeRepository;
+import org.openmetadata.service.util.EntityUtil;
import org.openmetadata.service.util.FullyQualifiedName;
import org.openmetadata.service.util.JsonUtils;
@@ -49,6 +56,35 @@ public static TypeRegistry instance() {
return INSTANCE;
}
+ public final void initialize(TypeRepository repository) {
+ // Load types defined in OpenMetadata schemas
+ long now = System.currentTimeMillis();
+ List<Type> types = JsonUtils.getTypes();
+ types.forEach(
+ type -> {
+ type.withId(UUID.randomUUID()).withUpdatedBy(ADMIN_USER_NAME).withUpdatedAt(now);
+ LOG.debug("Loading type {}", type.getName());
+ try {
+ EntityUtil.Fields fields = repository.getFields(PROPERTIES_FIELD);
+ try {
+ Type storedType = repository.getByName(null, type.getName(), fields);
+ type.setId(storedType.getId());
+ // If entity type already exists, then carry forward custom properties
+ if (storedType.getCategory().equals(Category.Entity)) {
+ type.setCustomProperties(storedType.getCustomProperties());
+ }
+ } catch (Exception e) {
+ LOG.debug(
+ "Type '{}' not found. Proceeding to add new type entity in database.",
+ type.getName());
+ }
+ repository.addToRegistry(type);
+ } catch (Exception e) {
+ LOG.error("Error loading type {}", type.getName(), e);
+ }
+ });
+ }
+
public void addType(Type type) {
TYPES.put(type.getName(), type);
@@ -111,34 +147,25 @@ public static String getPropertyName(String propertyFQN) {
}
public static String getCustomPropertyType(String entityType, String propertyName) {
- Type type = TypeRegistry.TYPES.get(entityType);
- if (type != null && type.getCustomProperties() != null) {
- for (CustomProperty property : type.getCustomProperties()) {
- if (property.getName().equals(propertyName)) {
- return property.getPropertyType().getName();
- }
- }
+ String fqn = getCustomPropertyFQN(entityType, propertyName);
+ CustomProperty property = CUSTOM_PROPERTIES.get(fqn);
+ if (property == null) {
+ throw EntityNotFoundException.byMessage(
+ CatalogExceptionMessage.entityNotFound(propertyName, entityType));
}
- throw EntityNotFoundException.byMessage(
- CatalogExceptionMessage.entityNotFound(Entity.TYPE, String.valueOf(type)));
+ return property.getPropertyType().getName();
}
public static String getCustomPropertyConfig(String entityType, String propertyName) {
- Type type = TypeRegistry.TYPES.get(entityType);
- if (type != null && type.getCustomProperties() != null) {
- for (CustomProperty property : type.getCustomProperties()) {
- if (property.getName().equals(propertyName)
- && property.getCustomPropertyConfig() != null
- && property.getCustomPropertyConfig().getConfig() != null) {
- Object config = property.getCustomPropertyConfig().getConfig();
- if (config instanceof String || config instanceof Integer) {
- return config.toString(); // for simple type config return as string
- } else {
- return JsonUtils.pojoToJson(
- config); // for complex object in config return as JSON string
- }
- }
- }
+ String fqn = getCustomPropertyFQN(entityType, propertyName);
+ CustomProperty property = CUSTOM_PROPERTIES.get(fqn);
+ if (property != null
+ && property.getCustomPropertyConfig() != null
+ && property.getCustomPropertyConfig().getConfig() != null) {
+ Object config = property.getCustomPropertyConfig().getConfig();
+ return (config instanceof String || config instanceof Integer)
+ ? config.toString() // for simple type config return as string
+ : JsonUtils.pojoToJson(config); // for complex object in config return as JSON string
}
return null;
}
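Both lookups above now resolve a property through a single FQN-keyed map instead of scanning the type's property list on every call. A rough standalone sketch of that lookup shape (the key separator, class names, and exception type are assumptions for illustration, not the actual `TypeRegistry` internals):

```java
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.concurrent.ConcurrentHashMap;

public class PropertyRegistrySketch {
  record CustomProperty(String name, String propertyType) {}

  // Keyed by "<entityType>.<propertyName>"; the real registry builds keys via FullyQualifiedName.
  private static final Map<String, CustomProperty> CUSTOM_PROPERTIES = new ConcurrentHashMap<>();

  static String fqn(String entityType, String propertyName) {
    return entityType + "." + propertyName; // assumed separator for illustration
  }

  static void register(String entityType, CustomProperty property) {
    CUSTOM_PROPERTIES.put(fqn(entityType, property.name()), property);
  }

  static String getCustomPropertyType(String entityType, String propertyName) {
    CustomProperty property = CUSTOM_PROPERTIES.get(fqn(entityType, propertyName));
    if (property == null) {
      // The service throws EntityNotFoundException here; a stand-in exception is used in this sketch.
      throw new NoSuchElementException(propertyName + " not found for " + entityType);
    }
    return property.propertyType();
  }

  public static void main(String[] args) {
    register("table", new CustomProperty("dataDomain", "string"));
    System.out.println(getCustomPropertyType("table", "dataDomain")); // prints: string
  }
}
```

The map lookup keeps the behavior of the old loop (including the not-found error) while making the cost independent of how many custom properties an entity type has.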
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/changeEvent/AbstractEventConsumer.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/changeEvent/AbstractEventConsumer.java
index a0a11f031205..7db4c4e4f89c 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/changeEvent/AbstractEventConsumer.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/changeEvent/AbstractEventConsumer.java
@@ -32,10 +32,12 @@
import org.openmetadata.schema.entity.events.EventSubscriptionOffset;
import org.openmetadata.schema.entity.events.FailedEvent;
import org.openmetadata.schema.entity.events.SubscriptionDestination;
+import org.openmetadata.schema.system.EntityError;
import org.openmetadata.schema.type.ChangeEvent;
import org.openmetadata.service.Entity;
import org.openmetadata.service.events.errors.EventPublisherException;
import org.openmetadata.service.util.JsonUtils;
+import org.openmetadata.service.util.ResultList;
import org.quartz.DisallowConcurrentExecution;
import org.quartz.Job;
import org.quartz.JobDetail;
@@ -70,8 +72,9 @@ private void init(JobExecutionContext context) {
(EventSubscription) context.getJobDetail().getJobDataMap().get(ALERT_INFO_KEY);
this.jobDetail = context.getJobDetail();
this.eventSubscription = sub;
- this.offset = loadInitialOffset(context).getCurrentOffset();
- this.startingOffset = loadInitialOffset(context).getStartingOffset();
+ EventSubscriptionOffset eventSubscriptionOffset = loadInitialOffset(context);
+ this.offset = eventSubscriptionOffset.getCurrentOffset();
+ this.startingOffset = eventSubscriptionOffset.getStartingOffset();
this.alertMetrics = loadInitialMetrics();
this.destinationMap = loadDestinationsMap(context);
this.doInit(context);
@@ -240,34 +243,46 @@ public void commit(JobExecutionContext jobExecutionContext) {
}
@Override
- public List<ChangeEvent> pollEvents(long offset, long batchSize) {
- // Read from Change Event Table
+ public ResultList<ChangeEvent> pollEvents(long offset, long batchSize) {
List<String> eventJson = Entity.getCollectionDAO().changeEventDAO().list(batchSize, offset);
-
List<ChangeEvent> changeEvents = new ArrayList<>();
+ List<EntityError> errorEvents = new ArrayList<>();
for (String json : eventJson) {
- ChangeEvent event = JsonUtils.readValue(json, ChangeEvent.class);
- changeEvents.add(event);
+ try {
+ ChangeEvent event = JsonUtils.readValue(json, ChangeEvent.class);
+ changeEvents.add(event);
+ } catch (Exception ex) {
+ errorEvents.add(new EntityError().withMessage(ex.getMessage()).withEntity(json));
+ LOG.error("Error in Parsing Change Event : {} , Message: {} ", json, ex.getMessage(), ex);
+ }
}
- return changeEvents;
+ return new ResultList<>(changeEvents, errorEvents, null, null, eventJson.size());
}
@Override
public void execute(JobExecutionContext jobExecutionContext) {
// Must Have , Before Execute the Init, Quartz Requires a Non-Arg Constructor
this.init(jobExecutionContext);
- // Poll Events from Change Event Table
- List<ChangeEvent> batch = pollEvents(offset, eventSubscription.getBatchSize());
- int batchSize = batch.size();
- Map<ChangeEvent, Set<UUID>> eventsWithReceivers = createEventsWithReceivers(batch);
+ long batchSize = 0;
+ Map<ChangeEvent, Set<UUID>> eventsWithReceivers = new HashMap<>();
try {
+ // Poll Events from Change Event Table
+ ResultList<ChangeEvent> batch = pollEvents(offset, eventSubscription.getBatchSize());
+ batchSize = batch.getPaging().getTotal();
+ eventsWithReceivers.putAll(createEventsWithReceivers(batch.getData()));
// Publish Events
if (!eventsWithReceivers.isEmpty()) {
alertMetrics.withTotalEvents(alertMetrics.getTotalEvents() + eventsWithReceivers.size());
publishEvents(eventsWithReceivers);
}
} catch (Exception e) {
- LOG.error("Error in executing the Job : {} ", e.getMessage());
+ LOG.error(
+ "Error in polling events for alert : {} , Offset : {} , Batch Size : {} ",
+ e.getMessage(),
+ offset,
+ batchSize,
+ e);
+
} finally {
if (!eventsWithReceivers.isEmpty()) {
// Commit the Offset
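The key behavioral change in this consumer is that one malformed change event no longer aborts the whole poll: bad rows are captured as errors while good rows still flow to the destinations. A minimal standalone sketch of that per-record error capture, assuming jackson-databind 2.12+ on the classpath (the `Event`, `ParseError`, and `Batch` records are illustrative stand-ins for `ChangeEvent`, `EntityError`, and `ResultList`):

```java
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.ArrayList;
import java.util.List;

public class TolerantBatchParseSketch {
  record Event(String id, String entityType) {}
  record ParseError(String message, String rawJson) {}
  record Batch(List<Event> data, List<ParseError> errors, int total) {}

  private static final ObjectMapper MAPPER = new ObjectMapper();

  // Mirrors the idea of the new pollEvents: failures are collected, successes still flow through.
  static Batch parse(List<String> rows) {
    List<Event> events = new ArrayList<>();
    List<ParseError> errors = new ArrayList<>();
    for (String json : rows) {
      try {
        events.add(MAPPER.readValue(json, Event.class));
      } catch (Exception ex) {
        errors.add(new ParseError(ex.getMessage(), json));
      }
    }
    return new Batch(events, errors, rows.size());
  }

  public static void main(String[] args) {
    Batch batch = parse(List.of(
        "{\"id\":\"1\",\"entityType\":\"table\"}",
        "{not valid json"));
    System.out.println(batch.data().size() + " parsed, " + batch.errors().size() + " failed");
  }
}
```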
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/changeEvent/Consumer.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/changeEvent/Consumer.java
index 9f1c8482062a..0d1be623ce08 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/changeEvent/Consumer.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/changeEvent/Consumer.java
@@ -13,16 +13,16 @@
package org.openmetadata.service.apps.bundles.changeEvent;
-import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import org.openmetadata.schema.type.ChangeEvent;
import org.openmetadata.service.events.errors.EventPublisherException;
+import org.openmetadata.service.util.ResultList;
import org.quartz.JobExecutionContext;
public interface Consumer {
- List<ChangeEvent> pollEvents(long offset, long batchSize);
+ ResultList<ChangeEvent> pollEvents(long offset, long batchSize);
void publishEvents(Map<ChangeEvent, Set<UUID>> events);
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/insights/workflows/dataAssets/processors/DataInsightsEntityEnricherProcessor.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/insights/workflows/dataAssets/processors/DataInsightsEntityEnricherProcessor.java
index 840b7694fc5f..210aaf38bdbe 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/insights/workflows/dataAssets/processors/DataInsightsEntityEnricherProcessor.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/insights/workflows/dataAssets/processors/DataInsightsEntityEnricherProcessor.java
@@ -4,6 +4,7 @@
import static org.openmetadata.service.apps.bundles.insights.utils.TimestampUtils.END_TIMESTAMP_KEY;
import static org.openmetadata.service.apps.bundles.insights.utils.TimestampUtils.START_TIMESTAMP_KEY;
import static org.openmetadata.service.apps.bundles.insights.workflows.dataAssets.DataAssetsWorkflow.ENTITY_TYPE_FIELDS_KEY;
+import static org.openmetadata.service.search.SearchIndexUtils.parseFollowers;
import static org.openmetadata.service.workflows.searchIndex.ReindexingUtil.ENTITY_TYPE_KEY;
import static org.openmetadata.service.workflows.searchIndex.ReindexingUtil.TIMESTAMP_KEY;
import static org.openmetadata.service.workflows.searchIndex.ReindexingUtil.getUpdatedStats;
@@ -231,10 +232,8 @@ private Map<String, Object> enrichEntity(
oCustomProperties.ifPresent(
o -> entityMap.put(String.format("%sCustomProperty", entityType), o));
- // Remove 'changeDescription' field
- entityMap.remove("changeDescription");
- // Remove 'sampleData'
- entityMap.remove("sampleData");
+ // Parse Followers:
+ entityMap.put("followers", parseFollowers(entity.getFollowers()));
return entityMap;
}
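The enricher now adds a parsed followers entry to the map being indexed. The exact shape produced by `SearchIndexUtils.parseFollowers` is not shown in this diff; the sketch below assumes it reduces follower references to their ids, which is one plausible form for a search document:

```java
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

public class FollowerEnrichmentSketch {
  record EntityReference(UUID id, String type) {}

  // Assumed behavior: reduce follower references to a list of id strings for the search document.
  static List<String> parseFollowers(List<EntityReference> followers) {
    return followers == null
        ? List.of()
        : followers.stream().map(ref -> ref.id().toString()).toList();
  }

  public static void main(String[] args) {
    List<EntityReference> followers = List.of(new EntityReference(UUID.randomUUID(), "user"));
    Map<String, Object> entityMap = new HashMap<>();
    entityMap.put("followers", parseFollowers(followers));
    System.out.println(entityMap);
  }
}
```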
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/scheduler/AbstractOmAppJobListener.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/scheduler/AbstractOmAppJobListener.java
index 8d256f4167fa..d110da423fca 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/apps/scheduler/AbstractOmAppJobListener.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/scheduler/AbstractOmAppJobListener.java
@@ -61,7 +61,8 @@ public void jobToBeExecuted(JobExecutionContext jobExecutionContext) {
.withTimestamp(jobStartTime)
.withRunType(runType)
.withStatus(AppRunRecord.Status.RUNNING)
- .withScheduleInfo(jobApp.getAppSchedule());
+ .withScheduleInfo(jobApp.getAppSchedule())
+ .withConfig(JsonUtils.getMap(jobApp.getAppConfiguration()));
boolean update = false;
if (jobExecutionContext.isRecovering()) {
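The run record now also snapshots the app configuration as a generic map via `JsonUtils.getMap`. A small standalone sketch of an equivalent conversion, assuming Jackson's `convertValue` as a stand-in for the OpenMetadata helper:

```java
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.Map;

public class ConfigSnapshotSketch {
  record AppConfig(int batchSize, boolean recreateIndex) {}

  private static final ObjectMapper MAPPER = new ObjectMapper();

  // Assumed equivalent of JsonUtils.getMap: flatten the configuration POJO into a generic map
  // so it can be stored alongside the run record.
  static Map<String, Object> toMap(Object config) {
    return MAPPER.convertValue(config, new TypeReference<Map<String, Object>>() {});
  }

  public static void main(String[] args) {
    System.out.println(toMap(new AppConfig(100, true))); // {batchSize=100, recreateIndex=true}
  }
}
```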
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/scheduler/AppScheduler.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/scheduler/AppScheduler.java
index c9db40ee835d..50c1fee40983 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/apps/scheduler/AppScheduler.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/scheduler/AppScheduler.java
@@ -12,6 +12,9 @@
import java.util.List;
import java.util.Map;
import java.util.Properties;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
import org.openmetadata.common.utils.CommonUtil;
@@ -94,12 +97,17 @@ private AppScheduler(
.getListenerManager()
.addJobListener(new OmAppJobListener(dao), jobGroupEquals(APPS_JOB_GROUP));
- this.resetErrorTriggers();
+ ScheduledExecutorService threadScheduler = Executors.newScheduledThreadPool(1);
+ threadScheduler.scheduleAtFixedRate(this::resetErrorTriggers, 0, 24, TimeUnit.HOURS);
// Start Scheduler
this.scheduler.start();
}
+ /* Quartz triggers can go into an "ERROR" state in some cases, most notably when the job's
+ constructor throws an error. It is unclear why this happens and the issue seems to be transient.
+ This method resets all triggers in the ERROR state to the normal state.
+ */
private void resetErrorTriggers() {
try {
scheduler
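The one-off reset at startup is replaced with a daily reset driven by a single-thread scheduled executor. A minimal standalone sketch of that scheduling shape, with the Quartz-specific reset logic left as a placeholder:

```java
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class PeriodicResetSketch {
  public static void main(String[] args) throws InterruptedException {
    ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();

    // Run immediately, then every 24 hours, mirroring scheduleAtFixedRate(this::resetErrorTriggers, 0, 24, HOURS).
    scheduler.scheduleAtFixedRate(PeriodicResetSketch::resetErrorTriggers, 0, 24, TimeUnit.HOURS);

    Thread.sleep(1_000); // keep the demo alive long enough for the first run
    scheduler.shutdown();
  }

  static void resetErrorTriggers() {
    try {
      // Placeholder for the Quartz-specific logic that moves triggers out of the ERROR state.
      System.out.println("Resetting triggers stuck in ERROR state");
    } catch (Exception e) {
      // Catching here matters: an uncaught exception would cancel future runs of a fixed-rate task.
      System.err.println("Failed to reset triggers: " + e.getMessage());
    }
  }
}
```

Keeping the reset task's body exception-safe is important with `scheduleAtFixedRate`, since a thrown exception silently suppresses all subsequent executions.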
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/exception/EventSubscriptionJobException.java b/openmetadata-service/src/main/java/org/openmetadata/service/exception/EventSubscriptionJobException.java
index 6944de845734..caa0f6fa13d9 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/exception/EventSubscriptionJobException.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/exception/EventSubscriptionJobException.java
@@ -5,6 +5,10 @@ public EventSubscriptionJobException(String message) {
super(message);
}
+ public EventSubscriptionJobException(String message, Throwable throwable) {
+ super(message, throwable);
+ }
+
public EventSubscriptionJobException(Throwable throwable) {
super(throwable);
}
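The new two-argument constructor matters mostly for diagnostics: wrapping with the cause preserves the original stack trace. A tiny sketch of the wrap-and-rethrow pattern it enables (the exception and messages here are illustrative, not the service's actual call sites):

```java
public class CausePreservingWrapSketch {
  static class JobException extends RuntimeException {
    JobException(String message, Throwable cause) {
      super(message, cause); // keeps the original stack trace attached
    }
  }

  public static void main(String[] args) {
    try {
      try {
        throw new IllegalStateException("underlying failure");
      } catch (Exception e) {
        throw new JobException("event subscription job failed", e);
      }
    } catch (JobException wrapped) {
      wrapped.printStackTrace(); // includes "Caused by: java.lang.IllegalStateException: underlying failure"
    }
  }
}
```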
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/governance/workflows/Workflow.java b/openmetadata-service/src/main/java/org/openmetadata/service/governance/workflows/Workflow.java
index 7b6bb3111590..d65fa9b503ca 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/governance/workflows/Workflow.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/governance/workflows/Workflow.java
@@ -7,15 +7,16 @@
@Getter
public class Workflow {
+ public static final String INGESTION_PIPELINE_ID_VARIABLE = "ingestionPipelineId";
public static final String RELATED_ENTITY_VARIABLE = "relatedEntity";
- public static final String PAYLOAD = "payload";
public static final String RESULT_VARIABLE = "result";
- public static final String RESOLVED_BY_VARIABLE = "resolvedBy";
+ public static final String UPDATED_BY_VARIABLE = "updatedBy";
public static final String STAGE_INSTANCE_STATE_ID_VARIABLE = "stageInstanceStateId";
public static final String WORKFLOW_INSTANCE_EXECUTION_ID_VARIABLE =
"workflowInstanceExecutionId";
public static final String WORKFLOW_RUNTIME_EXCEPTION = "workflowRuntimeException";
public static final String EXCEPTION_VARIABLE = "exception";
+ public static final String GLOBAL_NAMESPACE = "global";
private final TriggerWorkflow triggerWorkflow;
private final MainWorkflow mainWorkflow;
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/governance/workflows/WorkflowEventConsumer.java b/openmetadata-service/src/main/java/org/openmetadata/service/governance/workflows/WorkflowEventConsumer.java
index 0167230c8330..eaa3cde064a6 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/governance/workflows/WorkflowEventConsumer.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/governance/workflows/WorkflowEventConsumer.java
@@ -1,6 +1,9 @@
package org.openmetadata.service.governance.workflows;
import static org.openmetadata.schema.entity.events.SubscriptionDestination.SubscriptionType.GOVERNANCE_WORKFLOW_CHANGE_EVENT;
+import static org.openmetadata.service.governance.workflows.Workflow.GLOBAL_NAMESPACE;
+import static org.openmetadata.service.governance.workflows.Workflow.RELATED_ENTITY_VARIABLE;
+import static org.openmetadata.service.governance.workflows.WorkflowVariableHandler.getNamespacedVariableName;
import java.util.HashMap;
import java.util.List;
@@ -60,7 +63,9 @@ public void sendMessage(ChangeEvent event) throws EventPublisherException {
Map variables = new HashMap<>();
- variables.put("relatedEntity", entityLink.getLinkString());
+ variables.put(
+ getNamespacedVariableName(GLOBAL_NAMESPACE, RELATED_ENTITY_VARIABLE),
+ entityLink.getLinkString());
WorkflowHandler.getInstance().triggerWithSignal(signal, variables);
}
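The consumer now stores the related entity under a namespaced key built from `GLOBAL_NAMESPACE` and `RELATED_ENTITY_VARIABLE` rather than the bare `"relatedEntity"` string. The exact key format produced by `WorkflowVariableHandler.getNamespacedVariableName` is not shown in this diff; the sketch below assumes a simple delimiter purely for illustration:

```java
import java.util.HashMap;
import java.util.Map;

public class NamespacedVariableSketch {
  static final String GLOBAL_NAMESPACE = "global";

  // Assumed key format; the real helper lives in WorkflowVariableHandler and may use a different delimiter.
  static String getNamespacedVariableName(String namespace, String variable) {
    return namespace + "_" + variable;
  }

  public static void main(String[] args) {
    Map<String, Object> variables = new HashMap<>();
    variables.put(
        getNamespacedVariableName(GLOBAL_NAMESPACE, "relatedEntity"),
        "<#E::table::sample_db.schema.customers>");
    System.out.println(variables); // {global_relatedEntity=<#E::table::sample_db.schema.customers>}
  }
}
```

Namespacing keeps trigger-level variables from colliding with stage-level variables once multiple workflow scopes write into the same process instance.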
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/governance/workflows/WorkflowHandler.java b/openmetadata-service/src/main/java/org/openmetadata/service/governance/workflows/WorkflowHandler.java
index 16287cd21fda..305375a31e72 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/governance/workflows/WorkflowHandler.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/governance/workflows/WorkflowHandler.java
@@ -1,17 +1,20 @@
package org.openmetadata.service.governance.workflows;
+import static org.openmetadata.service.governance.workflows.WorkflowVariableHandler.getNamespacedVariableName;
import static org.openmetadata.service.governance.workflows.elements.TriggerFactory.getTriggerWorkflowId;
import java.time.Duration;
import java.util.Arrays;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
+import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
import org.flowable.bpmn.converter.BpmnXMLConverter;
-import org.flowable.common.engine.api.FlowableException;
import org.flowable.common.engine.api.FlowableObjectNotFoundException;
+import org.flowable.common.engine.impl.el.DefaultExpressionManager;
import org.flowable.engine.HistoryService;
import org.flowable.engine.ProcessEngine;
import org.flowable.engine.ProcessEngineConfiguration;
@@ -29,9 +32,11 @@
import org.openmetadata.schema.governance.workflows.WorkflowDefinition;
import org.openmetadata.service.Entity;
import org.openmetadata.service.OpenMetadataApplicationConfig;
+import org.openmetadata.service.clients.pipeline.PipelineServiceClientFactory;
import org.openmetadata.service.exception.UnhandledServerException;
import org.openmetadata.service.jdbi3.SystemRepository;
import org.openmetadata.service.jdbi3.locator.ConnectionType;
+import org.openmetadata.service.resources.services.ingestionpipelines.IngestionPipelineMapper;
@Slf4j
public class WorkflowHandler {
@@ -40,8 +45,9 @@ public class WorkflowHandler {
private RuntimeService runtimeService;
private TaskService taskService;
private HistoryService historyService;
+ private final Map