diff --git a/examples/notebooks/pdf-processing-1/README.md b/examples/notebooks/pdf-processing-1/README.md index c7fdf8ffb..043f37cde 100644 --- a/examples/notebooks/pdf-processing-1/README.md +++ b/examples/notebooks/pdf-processing-1/README.md @@ -37,9 +37,9 @@ PDF files are located in [examples/data-files/pdf-processing-1](../../data-files ## Running the code -[python version](pdf_processing_1_python.ipynb)   [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/process-pdf-1/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb) +[python version](pdf_processing_1_python.ipynb)   [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb) -[ray version](pdf_processing_1_ray.ipynb)   [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/process-pdf-1/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb) +[ray version](pdf_processing_1_ray.ipynb)   [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb) ## Troubleshooting diff --git a/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb b/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb index e6b4cb951..d8c1b9d90 100644 --- a/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb +++ b/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb @@ -9,6 +9,8 @@ "source": [ "# Processing PDFs using Data Prep Kit\n", "\n", + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb)\n", + "\n", "This notebook will introduce DPK and showcase some of it's capabilities.\n", "\n", "Here is the workflow:\n", @@ -19,7 +21,7 @@ "- fuzzy dedupe : filter out 'near duplicates'\n", "- document quality: scoring documents for quality\n", "\n", - "![](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png)\n" + "![](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png)\n" ] }, { @@ -33,7 +35,7 @@ "\n", "Two options:\n", "\n", - "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/process-pdf-1/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb)\n", + "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb)\n", "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", "\n", "The notebook will work as in both environments" @@ -322,11 +324,11 @@ "\n", "We will use simple PDFs. 
The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/pdf-processing-1/)\n", "\n", - "- [earth.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/earth.pdf) and exact duplicate [earth-copy.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/earth-copy.pdf)\n", - "- [earth2.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/earth2.pdf) almost similar to earth.pdf (ONE word difference!)\n", - "- [mars.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/mars.pdf)\n", - "- [spam.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/spam.pdf) - contains spammy contents\n", - "- [lorem-ipsum.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/lorem-ipsum.pdf) - contains 'lorem ipsum' placeholder\n" + "- [earth.pdf](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/earth.pdf) and exact duplicate [earth-copy.pdf](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/earth-copy.pdf)\n", + "- [earth2.pdf](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/earth2.pdf) almost similar to earth.pdf (ONE word difference!)\n", + "- [mars.pdf](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/mars.pdf)\n", + "- [spam.pdf](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/spam.pdf) - contains spammy contents\n", + "- [lorem-ipsum.pdf](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/lorem-ipsum.pdf) - contains 'lorem ipsum' placeholder\n" ] }, { @@ -362,17 +364,17 @@
"source": [ "if RUNNING_IN_COLAB:\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/data-files/pdf-processing-1/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/data-files/pdf-processing-1/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/data-files/pdf-processing-1/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/data-files/pdf-processing-1/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/spam.pdf', os.path.join(input_dir, 'spam.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/data-files/pdf-processing-1/spam.pdf', os.path.join(input_dir, 'spam.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/lorem-ipsum.pdf', os.path.join(input_dir,
'lorem-ipsum.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/data-files/pdf-processing-1/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))\n", "else:\n", " print ('Using input files from : ', input_dir)" ] @@ -441,21 +443,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "23:06:13 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", - "23:06:13 INFO - pipeline id pipeline_id\n", - "23:06:13 INFO - code location None\n", - "23:06:13 INFO - data factory data_ is using local data access: input_folder - ../../data-files/pdf-processing-1/ output_folder - output/01_pdf2pq_out\n", - "23:06:13 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:06:13 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "23:06:13 INFO - orchestrator pdf2parquet started at 2025-02-04 23:06:13\n", - "23:06:13 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", - "23:06:13 INFO - Initializing models\n" + "13:54:24 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", + "13:54:24 INFO - pipeline id pipeline_id\n", + "13:54:24 INFO - code location None\n", + "13:54:24 INFO - data factory data_ is using local data access: input_folder - ../../data-files/pdf-processing-1/ output_folder - output/01_pdf2pq_out\n", + "13:54:24 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:54:24 INFO - data factory data_ Not using data sets,
checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "13:54:24 INFO - orchestrator pdf2parquet started at 2025-02-06 13:54:24\n", + "13:54:24 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", + "13:54:24 INFO - Initializing models\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "21a1c68550c848cba79340080a1ccde4", + "model_id": "f1a499a391784b7ba00cb9b1730bac8d", "version_major": 2, "version_minor": 0 }, @@ -470,15 +472,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "23:06:18 INFO - Completed 1 files (16.67%) in 0.018 min\n", - "23:06:19 INFO - Completed 2 files (33.33%) in 0.033 min\n", - "23:06:19 INFO - Completed 3 files (50.0%) in 0.044 min\n", - "23:06:20 INFO - Completed 4 files (66.67%) in 0.055 min\n", - "23:06:21 INFO - Completed 5 files (83.33%) in 0.067 min\n", - "23:06:21 INFO - Completed 6 files (100.0%) in 0.078 min\n", - "23:06:21 INFO - Done processing 6 files, waiting for flush() completion.\n", - "23:06:21 INFO - done flushing in 0.0 sec\n", - "23:06:21 INFO - Completed execution in 0.141 min, execution result 0\n" + "13:54:29 INFO - Completed 1 files (16.67%) in 0.019 min\n", + "13:54:30 INFO - Completed 2 files (33.33%) in 0.034 min\n", + "13:54:31 INFO - Completed 3 files (50.0%) in 0.045 min\n", + "13:54:32 INFO - Completed 4 files (66.67%) in 0.056 min\n", + "13:54:32 INFO - Completed 5 files (83.33%) in 0.067 min\n", + "13:54:33 INFO - Completed 6 files (100.0%) in 0.077 min\n", + "13:54:33 INFO - Done processing 6 files, waiting for flush() completion.\n", + "13:54:33 INFO - done flushing in 0.0 sec\n", + "13:54:33 INFO - Completed execution in 0.143 min, execution result 0\n" ] }, { @@ -486,8 +488,8 @@ "output_type": "stream", "text": [ "✅ Stage:1 completed successfully\n", - "CPU times: user 21.5 s, sys: 1.94 s, total: 
23.5 s\n", - "Wall time: 11.3 s\n" + "CPU times: user 21.2 s, sys: 2.29 s, total: 23.5 s\n", + "Wall time: 12.5 s\n" ] } ], @@ -588,13 +590,13 @@ " 1\n", " 0\n", " 2\n", - " 52b1cdf4-b1ef-4375-8e6b-23f174592c06\n", + " 4be2a61e-96f5-4f58-bf6f-e829dbdfa9d3\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T23:06:20.470544\n", - " 0.693593\n", + " 2025-02-06T13:54:32.155384\n", + " 0.651216\n", " lorem-ipsum.pdf\n", " \n", " \n", @@ -604,13 +606,13 @@ " 1\n", " 0\n", " 2\n", - " 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8\n", + " 2bd06750-cb70-4689-b2b8-72913b929a1d\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T23:06:21.819893\n", - " 0.676735\n", + " 2025-02-06T13:54:33.440651\n", + " 0.617823\n", " spam.pdf\n", " \n", " \n", @@ -620,13 +622,13 @@ " 1\n", " 0\n", " 11\n", - " 875d0907-8dd3-4ef9-b3b0-a0083e7ad438\n", + " 594034db-1fcd-411b-a89e-d37e4defdfc2\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-02-04T23:06:19.774915\n", - " 0.641045\n", + " 2025-02-06T13:54:31.502460\n", + " 0.645348\n", " earth2.pdf\n", " \n", " \n", @@ -636,13 +638,13 @@ " 1\n", " 0\n", " 11\n", - " 6264e62a-0121-4cd4-8202-ea6e228e15f1\n", + " 20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T23:06:21.141230\n", - " 0.668992\n", + " 2025-02-06T13:54:32.821365\n", + " 0.664288\n", " mars.pdf\n", " \n", " \n", @@ -652,13 +654,13 @@ " 1\n", " 0\n", " 11\n", - " 582bc53b-96e2-4b09-8dd7-6a27a685a53e\n", + " 4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T23:06:18.199803\n", - " 1.053618\n", + " 2025-02-06T13:54:29.909555\n", + " 1.100482\n", " earth-copy.pdf\n", " \n", " \n", @@ 
-668,13 +670,13 @@ " 1\n", " 0\n", " 11\n", - " c6c18475-9365-4325-85dc-8acf6b969d8f\n", + " d1d30fbc-c1e9-4813-a067-085e50b4ee49\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T23:06:19.132090\n", - " 0.929218\n", + " 2025-02-06T13:54:30.855225\n", + " 0.931613\n", " earth.pdf\n", " \n", " \n", @@ -699,12 +701,12 @@ "5 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 52b1cdf4-b1ef-4375-8e6b-23f174592c06 6571294142213095721 pdf \n", - "1 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8 10026122586747302274 pdf \n", - "2 875d0907-8dd3-4ef9-b3b0-a0083e7ad438 10729312978404042321 pdf \n", - "3 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", - "4 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", - "5 c6c18475-9365-4325-85dc-8acf6b969d8f 14711865278795535908 pdf \n", + "0 4be2a61e-96f5-4f58-bf6f-e829dbdfa9d3 6571294142213095721 pdf \n", + "1 2bd06750-cb70-4689-b2b8-72913b929a1d 10026122586747302274 pdf \n", + "2 594034db-1fcd-411b-a89e-d37e4defdfc2 10729312978404042321 pdf \n", + "3 20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3 7758129997476962679 pdf \n", + "4 4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3 14711865278795535908 pdf \n", + "5 d1d30fbc-c1e9-4813-a067-085e50b4ee49 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -715,12 +717,12 @@ "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \n", - "0 2025-02-04T23:06:20.470544 0.693593 lorem-ipsum.pdf \n", - "1 2025-02-04T23:06:21.819893 0.676735 spam.pdf \n", - "2 2025-02-04T23:06:19.774915 0.641045 earth2.pdf \n", - "3 2025-02-04T23:06:21.141230 0.668992 mars.pdf \n", - "4 2025-02-04T23:06:18.199803 1.053618 earth-copy.pdf \n", - "5 2025-02-04T23:06:19.132090 0.929218 earth.pdf " + "0 2025-02-06T13:54:32.155384 0.651216 lorem-ipsum.pdf \n", + "1 2025-02-06T13:54:33.440651 0.617823 spam.pdf \n", + "2 2025-02-06T13:54:31.502460 0.645348 earth2.pdf \n", + "3 2025-02-06T13:54:32.821365 0.664288 mars.pdf \n", + "4 2025-02-06T13:54:29.909555 1.100482 earth-copy.pdf \n", + "5 2025-02-06T13:54:30.855225 0.931613 earth.pdf " ] }, "execution_count": 8, @@ -897,23 +899,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "23:06:22 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", - "23:06:22 INFO - pipeline id pipeline_id\n", - "23:06:22 INFO - code location None\n", - "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", - "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:06:22 INFO - orchestrator doc_id started at 2025-02-04 23:06:22\n", - "23:06:22 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", - "23:06:22 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "23:06:22 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "23:06:22 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "23:06:22 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "23:06:22 INFO - 
Completed 5 files (83.33%) in 0.0 min\n", - "23:06:22 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "23:06:22 INFO - Done processing 6 files, waiting for flush() completion.\n", - "23:06:22 INFO - done flushing in 0.0 sec\n", - "23:06:22 INFO - Completed execution in 0.0 min, execution result 0\n" + "13:54:33 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", + "13:54:33 INFO - pipeline id pipeline_id\n", + "13:54:33 INFO - code location None\n", + "13:54:33 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", + "13:54:33 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:54:33 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:54:33 INFO - orchestrator doc_id started at 2025-02-06 13:54:33\n", + "13:54:33 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", + "13:54:33 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "13:54:33 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "13:54:33 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "13:54:33 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "13:54:33 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "13:54:33 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "13:54:33 INFO - Done processing 6 files, waiting for flush() completion.\n", + "13:54:33 INFO - done flushing in 0.0 sec\n", + "13:54:33 INFO - Completed execution in 0.0 min, execution result 0\n" ] }, { @@ -921,8 +923,8 @@ "output_type": "stream", "text": [ "✅ Stage:2 completed successfully\n", - "CPU times: user 27.6 ms, sys: 2.32 ms, total: 29.9 ms\n", - "Wall time: 23.4 ms\n" + "CPU times: user 27 ms, sys: 3.61 ms, total: 30.7 ms\n", + "Wall 
time: 26.3 ms\n" ] } ], @@ -1027,13 +1029,13 @@ " 1\n", " 0\n", " 2\n", - " 52b1cdf4-b1ef-4375-8e6b-23f174592c06\n", + " 4be2a61e-96f5-4f58-bf6f-e829dbdfa9d3\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T23:06:20.470544\n", - " 0.693593\n", + " 2025-02-06T13:54:32.155384\n", + " 0.651216\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 3\n", @@ -1045,13 +1047,13 @@ " 1\n", " 0\n", " 2\n", - " 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8\n", + " 2bd06750-cb70-4689-b2b8-72913b929a1d\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T23:06:21.819893\n", - " 0.676735\n", + " 2025-02-06T13:54:33.440651\n", + " 0.617823\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", @@ -1063,13 +1065,13 @@ " 1\n", " 0\n", " 11\n", - " 875d0907-8dd3-4ef9-b3b0-a0083e7ad438\n", + " 594034db-1fcd-411b-a89e-d37e4defdfc2\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-02-04T23:06:19.774915\n", - " 0.641045\n", + " 2025-02-06T13:54:31.502460\n", + " 0.645348\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 2\n", @@ -1081,13 +1083,13 @@ " 1\n", " 0\n", " 11\n", - " 6264e62a-0121-4cd4-8202-ea6e228e15f1\n", + " 20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T23:06:21.141230\n", - " 0.668992\n", + " 2025-02-06T13:54:32.821365\n", + " 0.664288\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", @@ -1099,13 +1101,13 @@ " 1\n", " 0\n", " 11\n", - " 582bc53b-96e2-4b09-8dd7-6a27a685a53e\n", + " 4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 
2025-02-04T23:06:18.199803\n", - " 1.053618\n", + " 2025-02-06T13:54:29.909555\n", + " 1.100482\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", @@ -1117,13 +1119,13 @@ " 1\n", " 0\n", " 11\n", - " c6c18475-9365-4325-85dc-8acf6b969d8f\n", + " d1d30fbc-c1e9-4813-a067-085e50b4ee49\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T23:06:19.132090\n", - " 0.929218\n", + " 2025-02-06T13:54:30.855225\n", + " 0.931613\n", " earth.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 1\n", @@ -1150,12 +1152,12 @@ "5 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 52b1cdf4-b1ef-4375-8e6b-23f174592c06 6571294142213095721 pdf \n", - "1 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8 10026122586747302274 pdf \n", - "2 875d0907-8dd3-4ef9-b3b0-a0083e7ad438 10729312978404042321 pdf \n", - "3 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", - "4 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", - "5 c6c18475-9365-4325-85dc-8acf6b969d8f 14711865278795535908 pdf \n", + "0 4be2a61e-96f5-4f58-bf6f-e829dbdfa9d3 6571294142213095721 pdf \n", + "1 2bd06750-cb70-4689-b2b8-72913b929a1d 10026122586747302274 pdf \n", + "2 594034db-1fcd-411b-a89e-d37e4defdfc2 10729312978404042321 pdf \n", + "3 20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3 7758129997476962679 pdf \n", + "4 4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3 14711865278795535908 pdf \n", + "5 d1d30fbc-c1e9-4813-a067-085e50b4ee49 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1166,12 +1168,12 @@ "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-02-04T23:06:20.470544 0.693593 lorem-ipsum.pdf \n", - "1 2025-02-04T23:06:21.819893 0.676735 spam.pdf \n", - "2 2025-02-04T23:06:19.774915 0.641045 earth2.pdf \n", - "3 2025-02-04T23:06:21.141230 0.668992 mars.pdf \n", - "4 2025-02-04T23:06:18.199803 1.053618 earth-copy.pdf \n", - "5 2025-02-04T23:06:19.132090 0.929218 earth.pdf \n", + "0 2025-02-06T13:54:32.155384 0.651216 lorem-ipsum.pdf \n", + "1 2025-02-06T13:54:33.440651 0.617823 spam.pdf \n", + "2 2025-02-06T13:54:31.502460 0.645348 earth2.pdf \n", + "3 2025-02-06T13:54:32.821365 0.664288 mars.pdf \n", + "4 2025-02-06T13:54:29.909555 1.100482 earth-copy.pdf \n", + "5 2025-02-06T13:54:30.855225 0.931613 earth.pdf \n", "\n", " doc_hash int_id_column \n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 \n", @@ -1243,24 +1245,24 @@ "name": "stderr", "output_type": "stream", "text": [ - "23:06:22 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", - "23:06:22 INFO - pipeline id pipeline_id\n", - "23:06:22 INFO - code location None\n", - "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", - "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:06:22 INFO - orchestrator ededup started at 2025-02-04 23:06:22\n", - "23:06:22 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", - "23:06:22 INFO - Starting from the beginning\n", - "23:06:22 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "23:06:22 INFO - Completed 2 files (33.33%) in 0.0 min\n", - 
"23:06:22 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "23:06:22 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "23:06:22 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "23:06:22 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "23:06:22 INFO - Done processing 6 files, waiting for flush() completion.\n", - "23:06:22 INFO - done flushing in 0.0 sec\n", - "23:06:22 INFO - Completed execution in 0.0 min, execution result 0\n" + "13:54:33 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", + "13:54:33 INFO - pipeline id pipeline_id\n", + "13:54:33 INFO - code location None\n", + "13:54:33 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", + "13:54:33 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:54:33 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:54:33 INFO - orchestrator ededup started at 2025-02-06 13:54:33\n", + "13:54:33 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", + "13:54:33 INFO - Starting from the beginning\n", + "13:54:33 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "13:54:33 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "13:54:33 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "13:54:33 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "13:54:33 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "13:54:33 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "13:54:33 INFO - Done processing 6 files, waiting for flush() completion.\n", + "13:54:33 INFO - done flushing in 0.0 sec\n", + "13:54:33 INFO - Completed execution in 0.0 min, execution result 0\n" ] }, { @@ -1268,8 +1270,8 @@ "output_type": 
"stream", "text": [ "✅ Stage:3 completed successfully\n", - "CPU times: user 37.3 ms, sys: 3.56 ms, total: 40.9 ms\n", - "Wall time: 36.4 ms\n" + "CPU times: user 25.3 ms, sys: 4.27 ms, total: 29.5 ms\n", + "Wall time: 24.2 ms\n" ] } ], @@ -1375,13 +1377,13 @@ " 1\n", " 0\n", " 2\n", - " 52b1cdf4-b1ef-4375-8e6b-23f174592c06\n", + " 4be2a61e-96f5-4f58-bf6f-e829dbdfa9d3\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T23:06:20.470544\n", - " 0.693593\n", + " 2025-02-06T13:54:32.155384\n", + " 0.651216\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 3\n", @@ -1394,13 +1396,13 @@ " 1\n", " 0\n", " 2\n", - " 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8\n", + " 2bd06750-cb70-4689-b2b8-72913b929a1d\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T23:06:21.819893\n", - " 0.676735\n", + " 2025-02-06T13:54:33.440651\n", + " 0.617823\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", @@ -1413,13 +1415,13 @@ " 1\n", " 0\n", " 11\n", - " 875d0907-8dd3-4ef9-b3b0-a0083e7ad438\n", + " 594034db-1fcd-411b-a89e-d37e4defdfc2\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-02-04T23:06:19.774915\n", - " 0.641045\n", + " 2025-02-06T13:54:31.502460\n", + " 0.645348\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 2\n", @@ -1432,13 +1434,13 @@ " 1\n", " 0\n", " 11\n", - " 6264e62a-0121-4cd4-8202-ea6e228e15f1\n", + " 20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T23:06:21.141230\n", - " 0.668992\n", + " 2025-02-06T13:54:32.821365\n", + " 0.664288\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", @@ -1451,13 +1453,13 @@ " 1\n", " 0\n", " 11\n", - " 
582bc53b-96e2-4b09-8dd7-6a27a685a53e\n", + " 4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T23:06:18.199803\n", - " 1.053618\n", + " 2025-02-06T13:54:29.909555\n", + " 1.100482\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", @@ -1483,11 +1485,11 @@ "4 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 52b1cdf4-b1ef-4375-8e6b-23f174592c06 6571294142213095721 pdf \n", - "1 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8 10026122586747302274 pdf \n", - "2 875d0907-8dd3-4ef9-b3b0-a0083e7ad438 10729312978404042321 pdf \n", - "3 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", - "4 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", + "0 4be2a61e-96f5-4f58-bf6f-e829dbdfa9d3 6571294142213095721 pdf \n", + "1 2bd06750-cb70-4689-b2b8-72913b929a1d 10026122586747302274 pdf \n", + "2 594034db-1fcd-411b-a89e-d37e4defdfc2 10729312978404042321 pdf \n", + "3 20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3 7758129997476962679 pdf \n", + "4 4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1497,11 +1499,11 @@ "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-02-04T23:06:20.470544 0.693593 lorem-ipsum.pdf \n", - "1 2025-02-04T23:06:21.819893 0.676735 spam.pdf \n", - "2 2025-02-04T23:06:19.774915 0.641045 earth2.pdf \n", - "3 2025-02-04T23:06:21.141230 0.668992 mars.pdf \n", - "4 2025-02-04T23:06:18.199803 1.053618 earth-copy.pdf \n", + "0 2025-02-06T13:54:32.155384 0.651216 lorem-ipsum.pdf \n", + "1 2025-02-06T13:54:33.440651 0.617823 spam.pdf \n", + "2 2025-02-06T13:54:31.502460 0.645348 earth2.pdf \n", + "3 2025-02-06T13:54:32.821365 0.664288 mars.pdf \n", + "4 2025-02-06T13:54:29.909555 1.100482 earth-copy.pdf \n", "\n", " doc_hash int_id_column removed \n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", @@ -1596,109 +1598,109 @@ "name": "stderr", "output_type": "stream", "text": [ - "23:06:22 INFO - Starting SignatureCalculation step\n", - "23:06:22 INFO - Got parameters for SignatureCalculation\n", - "23:06:22 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.8, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "23:06:22 INFO - data factory scdata_ is using local configuration without input/output path\n", - "23:06:22 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "23:06:22 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:06:22 INFO - pipeline id pipeline_id\n", - "23:06:22 INFO - code location None\n", - "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:06:22 INFO - data factory data_ Not using data sets, checkpointing 
False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:06:22 INFO - orchestrator minhash started at 2025-02-04 23:06:22\n", - "23:06:22 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", - "23:06:22 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "23:06:22 WARNING - table is empty, skipping processing\n", - "23:06:22 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "23:06:22 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "23:06:22 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "23:06:22 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "23:06:22 INFO - Completed 6 files (100.0%) in 0.001 min\n", - "23:06:22 INFO - Done processing 6 files, waiting for flush() completion.\n", - "23:06:22 INFO - Starting flush()\n", - "23:06:22 INFO - Wrote 14 tables with a total size of 33,600 bytes\n", - "23:06:22 INFO - done flushing in 0.028 sec\n", - "23:06:22 INFO - Completed execution in 0.001 min, execution result 0\n", - "23:06:22 INFO - SignatureCalculation completed successfully\n", - "23:06:22 INFO - Starting ClusterAnalysis step\n", - "23:06:22 INFO - Got parameters for ClusterAnalysis\n", - "23:06:22 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.8, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "23:06:22 INFO - pipeline id pipeline_id\n", - "23:06:22 INFO - code location None\n", - "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", - "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:06:22 INFO - orchestrator cluster started at 
2025-02-04 23:06:22\n", - "23:06:22 INFO - Number of folders is 14\n", - "23:06:22 INFO - Completed 1 files (7.14%) in 0.0 min\n", - "23:06:22 INFO - Completed 2 files (14.29%) in 0.0 min\n", - "23:06:22 INFO - Completed 3 files (21.43%) in 0.0 min\n", - "23:06:22 INFO - Completed 4 files (28.57%) in 0.0 min\n", - "23:06:22 INFO - Completed 5 files (35.71%) in 0.0 min\n", - "23:06:22 INFO - Completed 6 files (42.86%) in 0.0 min\n", - "23:06:22 INFO - Completed 7 files (50.0%) in 0.0 min\n", - "23:06:22 INFO - Completed 8 files (57.14%) in 0.0 min\n", - "23:06:22 INFO - Completed 9 files (64.29%) in 0.001 min\n", - "23:06:22 INFO - Completed 10 files (71.43%) in 0.001 min\n", - "23:06:22 INFO - Completed 11 files (78.57%) in 0.001 min\n", - "23:06:22 INFO - Completed 12 files (85.71%) in 0.001 min\n", - "23:06:22 INFO - Completed 13 files (92.86%) in 0.001 min\n", - "23:06:22 INFO - Completed 14 files (100.0%) in 0.001 min\n", - "23:06:22 INFO - Done processing 14 files, waiting for flush() completion.\n", - "23:06:22 INFO - done flushing in 0.0 sec\n", - "23:06:22 INFO - Completed execution in 0.001 min, execution result 0\n", - "23:06:22 INFO - ClusterAnalysis completed successfully\n", - "23:06:22 INFO - Starting GetDuplicateList step\n", - "23:06:22 INFO - Got parameters for GetDuplicateList\n", - "23:06:22 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", - "23:06:22 INFO - pipeline id pipeline_id\n", - "23:06:22 INFO - code location None\n", - "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint 
['.parquet']\n", - "23:06:22 INFO - orchestrator fdlist started at 2025-02-04 23:06:22\n", - "23:06:22 INFO - Number of folders is 1\n", - "23:06:22 INFO - Get Duplicate List for folder docs_to_remove\n", - "23:06:22 INFO - 1 documents marked as duplicates\n", - "23:06:22 INFO - Completed 1 files (100.0%) in 0.0 min\n", - "23:06:22 INFO - Done processing 1 files, waiting for flush() completion.\n", - "23:06:22 INFO - done flushing in 0.0 sec\n", - "23:06:22 INFO - Completed execution in 0.0 min, execution result 0\n", - "23:06:22 INFO - GetDuplicateList completed successfully\n", - "23:06:22 INFO - Starting DataCleaning step\n", - "23:06:22 INFO - Got parameters for DataCleaning\n", - "23:06:22 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", - "23:06:22 INFO - data factory dcdata_ is using local configuration without input/output path\n", - "23:06:22 INFO - data factory dcdata_ max_files -1, n_sample -1\n", - "23:06:22 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:06:22 INFO - pipeline id pipeline_id\n", - "23:06:22 INFO - code location None\n", - "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", - "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:06:22 INFO - orchestrator fdclean started at 2025-02-04 23:06:22\n", - "23:06:22 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 
0.050751686096191406}\n", - "23:06:22 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "23:06:22 WARNING - table is empty, skipping processing\n", - "23:06:22 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "23:06:22 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "23:06:22 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "23:06:22 INFO - Completed 5 files (83.33%) in 0.001 min\n", - "23:06:22 INFO - Completed 6 files (100.0%) in 0.001 min\n", - "23:06:22 INFO - Done processing 6 files, waiting for flush() completion.\n", - "23:06:22 INFO - done flushing in 0.0 sec\n", - "23:06:22 INFO - Completed execution in 0.001 min, execution result 0\n", - "23:06:22 INFO - DataCleaning completed successfully\n" + "13:54:33 INFO - Starting SignatureCalculation step\n", + "13:54:33 INFO - Got parameters for SignatureCalculation\n", + "13:54:33 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.8, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "13:54:33 INFO - data factory scdata_ is using local configuration without input/output path\n", + "13:54:33 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "13:54:33 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:54:33 INFO - pipeline id pipeline_id\n", + "13:54:33 INFO - code location None\n", + "13:54:33 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "13:54:33 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:54:33 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:54:33 
INFO - orchestrator minhash started at 2025-02-06 13:54:33\n", + "13:54:33 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "13:54:33 INFO - Completed 1 files (16.67%) in 0.001 min\n", + "13:54:33 WARNING - table is empty, skipping processing\n", + "13:54:33 INFO - Completed 2 files (33.33%) in 0.001 min\n", + "13:54:33 INFO - Completed 3 files (50.0%) in 0.001 min\n", + "13:54:33 INFO - Completed 4 files (66.67%) in 0.001 min\n", + "13:54:33 INFO - Completed 5 files (83.33%) in 0.001 min\n", + "13:54:33 INFO - Completed 6 files (100.0%) in 0.001 min\n", + "13:54:33 INFO - Done processing 6 files, waiting for flush() completion.\n", + "13:54:33 INFO - Starting flush()\n", + "13:54:33 INFO - Wrote 14 tables with a total size of 33,600 bytes\n", + "13:54:33 INFO - done flushing in 0.031 sec\n", + "13:54:33 INFO - Completed execution in 0.001 min, execution result 0\n", + "13:54:33 INFO - SignatureCalculation completed successfully\n", + "13:54:33 INFO - Starting ClusterAnalysis step\n", + "13:54:33 INFO - Got parameters for ClusterAnalysis\n", + "13:54:33 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.8, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "13:54:33 INFO - pipeline id pipeline_id\n", + "13:54:33 INFO - code location None\n", + "13:54:33 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", + "13:54:33 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:54:33 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:54:33 INFO - orchestrator cluster started at 2025-02-06 13:54:33\n", + "13:54:33 INFO - Number of folders is 14\n", + "13:54:33 INFO - Completed 1 files (7.14%) 
in 0.0 min\n", + "13:54:33 INFO - Completed 2 files (14.29%) in 0.0 min\n", + "13:54:33 INFO - Completed 3 files (21.43%) in 0.0 min\n", + "13:54:33 INFO - Completed 4 files (28.57%) in 0.0 min\n", + "13:54:33 INFO - Completed 5 files (35.71%) in 0.0 min\n", + "13:54:33 INFO - Completed 6 files (42.86%) in 0.0 min\n", + "13:54:33 INFO - Completed 7 files (50.0%) in 0.001 min\n", + "13:54:34 INFO - Completed 8 files (57.14%) in 0.001 min\n", + "13:54:34 INFO - Completed 9 files (64.29%) in 0.001 min\n", + "13:54:34 INFO - Completed 10 files (71.43%) in 0.001 min\n", + "13:54:34 INFO - Completed 11 files (78.57%) in 0.001 min\n", + "13:54:34 INFO - Completed 12 files (85.71%) in 0.001 min\n", + "13:54:34 INFO - Completed 13 files (92.86%) in 0.001 min\n", + "13:54:34 INFO - Completed 14 files (100.0%) in 0.001 min\n", + "13:54:34 INFO - Done processing 14 files, waiting for flush() completion.\n", + "13:54:34 INFO - done flushing in 0.0 sec\n", + "13:54:34 INFO - Completed execution in 0.001 min, execution result 0\n", + "13:54:34 INFO - ClusterAnalysis completed successfully\n", + "13:54:34 INFO - Starting GetDuplicateList step\n", + "13:54:34 INFO - Got parameters for GetDuplicateList\n", + "13:54:34 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "13:54:34 INFO - pipeline id pipeline_id\n", + "13:54:34 INFO - code location None\n", + "13:54:34 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "13:54:34 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:54:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:54:34 INFO - orchestrator fdlist started at 2025-02-06 13:54:34\n", + "13:54:34 INFO - Number of 
folders is 1\n", + "13:54:34 INFO - Get Duplicate List for folder docs_to_remove\n", + "13:54:34 INFO - 1 documents marked as duplicates\n", + "13:54:34 INFO - Completed 1 files (100.0%) in 0.0 min\n", + "13:54:34 INFO - Done processing 1 files, waiting for flush() completion.\n", + "13:54:34 INFO - done flushing in 0.0 sec\n", + "13:54:34 INFO - Completed execution in 0.0 min, execution result 0\n", + "13:54:34 INFO - GetDuplicateList completed successfully\n", + "13:54:34 INFO - Starting DataCleaning step\n", + "13:54:34 INFO - Got parameters for DataCleaning\n", + "13:54:34 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "13:54:34 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "13:54:34 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "13:54:34 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:54:34 INFO - pipeline id pipeline_id\n", + "13:54:34 INFO - code location None\n", + "13:54:34 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", + "13:54:34 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:54:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:54:34 INFO - orchestrator fdclean started at 2025-02-06 13:54:34\n", + "13:54:34 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "13:54:34 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "13:54:34 WARNING - table is empty, 
skipping processing\n", + "13:54:34 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "13:54:34 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "13:54:34 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "13:54:34 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "13:54:34 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "13:54:34 INFO - Done processing 6 files, waiting for flush() completion.\n", + "13:54:34 INFO - done flushing in 0.0 sec\n", + "13:54:34 INFO - Completed execution in 0.0 min, execution result 0\n", + "13:54:34 INFO - DataCleaning completed successfully\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 288 ms, sys: 114 ms, total: 402 ms\n", - "Wall time: 262 ms\n" + "CPU times: user 224 ms, sys: 129 ms, total: 353 ms\n", + "Wall time: 290 ms\n" ] } ], @@ -1812,13 +1814,13 @@ " 1\n", " 0\n", " 2\n", - " 52b1cdf4-b1ef-4375-8e6b-23f174592c06\n", + " 4be2a61e-96f5-4f58-bf6f-e829dbdfa9d3\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T23:06:20.470544\n", - " 0.693593\n", + " 2025-02-06T13:54:32.155384\n", + " 0.651216\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 3\n", @@ -1831,13 +1833,13 @@ " 1\n", " 0\n", " 2\n", - " 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8\n", + " 2bd06750-cb70-4689-b2b8-72913b929a1d\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T23:06:21.819893\n", - " 0.676735\n", + " 2025-02-06T13:54:33.440651\n", + " 0.617823\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", @@ -1850,13 +1852,13 @@ " 1\n", " 0\n", " 11\n", - " 6264e62a-0121-4cd4-8202-ea6e228e15f1\n", + " 20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T23:06:21.141230\n", - " 0.668992\n", + " 2025-02-06T13:54:32.821365\n", + 
" 0.664288\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", @@ -1869,13 +1871,13 @@ " 1\n", " 0\n", " 11\n", - " 582bc53b-96e2-4b09-8dd7-6a27a685a53e\n", + " 4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T23:06:18.199803\n", - " 1.053618\n", + " 2025-02-06T13:54:29.909555\n", + " 1.100482\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", @@ -1899,10 +1901,10 @@ "3 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 52b1cdf4-b1ef-4375-8e6b-23f174592c06 6571294142213095721 pdf \n", - "1 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8 10026122586747302274 pdf \n", - "2 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", - "3 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", + "0 4be2a61e-96f5-4f58-bf6f-e829dbdfa9d3 6571294142213095721 pdf \n", + "1 2bd06750-cb70-4689-b2b8-72913b929a1d 10026122586747302274 pdf \n", + "2 20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3 7758129997476962679 pdf \n", + "3 4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1911,10 +1913,10 @@ "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-02-04T23:06:20.470544 0.693593 lorem-ipsum.pdf \n", - "1 2025-02-04T23:06:21.819893 0.676735 spam.pdf \n", - "2 2025-02-04T23:06:21.141230 0.668992 mars.pdf \n", - "3 2025-02-04T23:06:18.199803 1.053618 earth-copy.pdf \n", + "0 2025-02-06T13:54:32.155384 0.651216 lorem-ipsum.pdf \n", + "1 2025-02-06T13:54:33.440651 0.617823 spam.pdf \n", + "2 2025-02-06T13:54:32.821365 0.664288 mars.pdf \n", + "3 2025-02-06T13:54:29.909555 1.100482 earth-copy.pdf \n", "\n", " doc_hash int_id_column removed \n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 
3 [] \n", @@ -1994,27 +1996,27 @@ "name": "stderr", "output_type": "stream", "text": [ - "23:06:22 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", - "23:06:22 INFO - data factory docq_ is using local configuration without input/output path\n", - "23:06:22 INFO - data factory docq_ max_files -1, n_sample -1\n", - "23:06:22 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:06:22 INFO - pipeline id pipeline_id\n", - "23:06:22 INFO - code location None\n", - "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", - "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:06:22 INFO - orchestrator docq started at 2025-02-04 23:06:22\n", - "23:06:22 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0035142898559570312, 'total_file_size': 0.040172576904296875}\n", - "23:06:22 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", - "23:06:22 INFO - Completed 1 files (20.0%) in 0.0 min\n", - "23:06:22 WARNING - table is empty, skipping processing\n", - "23:06:22 INFO - Completed 2 files (40.0%) in 0.0 min\n", - "23:06:22 INFO - Completed 3 files (60.0%) in 0.0 min\n", - "23:06:22 INFO - Completed 4 files (80.0%) in 0.0 min\n", - "23:06:22 INFO - Completed 5 files 
(100.0%) in 0.0 min\n", - "23:06:22 INFO - Done processing 5 files, waiting for flush() completion.\n", - "23:06:22 INFO - done flushing in 0.0 sec\n", - "23:06:22 INFO - Completed execution in 0.0 min, execution result 0\n" + "13:54:34 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "13:54:34 INFO - data factory docq_ is using local configuration without input/output path\n", + "13:54:34 INFO - data factory docq_ max_files -1, n_sample -1\n", + "13:54:34 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:54:34 INFO - pipeline id pipeline_id\n", + "13:54:34 INFO - code location None\n", + "13:54:34 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", + "13:54:34 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:54:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:54:34 INFO - orchestrator docq started at 2025-02-06 13:54:34\n", + "13:54:34 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0035142898559570312, 'total_file_size': 0.040172576904296875}\n", + "13:54:34 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", + "13:54:34 INFO - Completed 1 files (20.0%) in 0.0 min\n", + "13:54:34 WARNING - table is empty, skipping processing\n", + "13:54:34 INFO - Completed 2 files (40.0%) in 0.0 min\n", + "13:54:34 INFO - 
Completed 3 files (60.0%) in 0.0 min\n", + "13:54:34 INFO - Completed 4 files (80.0%) in 0.0 min\n", + "13:54:34 INFO - Completed 5 files (100.0%) in 0.0 min\n", + "13:54:34 INFO - Done processing 5 files, waiting for flush() completion.\n", + "13:54:34 INFO - done flushing in 0.0 sec\n", + "13:54:34 INFO - Completed execution in 0.0 min, execution result 0\n" ] }, { @@ -2022,8 +2024,8 @@ "output_type": "stream", "text": [ "✅ Stage:5 completed successfully\n", - "CPU times: user 41.8 ms, sys: 1.98 ms, total: 43.7 ms\n", - "Wall time: 36.3 ms\n" + "CPU times: user 37 ms, sys: 3.43 ms, total: 40.4 ms\n", + "Wall time: 36 ms\n" ] } ], @@ -2138,7 +2140,7 @@ " 1\n", " 0\n", " 2\n", - " 52b1cdf4-b1ef-4375-8e6b-23f174592c06\n", + " 4be2a61e-96f5-4f58-bf6f-e829dbdfa9d3\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", @@ -2162,7 +2164,7 @@ " 1\n", " 0\n", " 2\n", - " 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8\n", + " 2bd06750-cb70-4689-b2b8-72913b929a1d\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", @@ -2186,7 +2188,7 @@ " 1\n", " 0\n", " 11\n", - " 6264e62a-0121-4cd4-8202-ea6e228e15f1\n", + " 20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -2210,7 +2212,7 @@ " 1\n", " 0\n", " 11\n", - " 582bc53b-96e2-4b09-8dd7-6a27a685a53e\n", + " 4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -2246,10 +2248,10 @@ "3 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 52b1cdf4-b1ef-4375-8e6b-23f174592c06 6571294142213095721 pdf \n", - "1 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8 10026122586747302274 pdf \n", - "2 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", - "3 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", + "0 4be2a61e-96f5-4f58-bf6f-e829dbdfa9d3 6571294142213095721 pdf \n", + 
"1 2bd06750-cb70-4689-b2b8-72913b929a1d 10026122586747302274 pdf \n", + "2 20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3 7758129997476962679 pdf \n", + "3 4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3 14711865278795535908 pdf \n", "\n", " hash size ... \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 ... \n", @@ -2376,7 +2378,7 @@ " 1\n", " 0\n", " 11\n", - " 6264e62a-0121-4cd4-8202-ea6e228e15f1\n", + " 20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -2400,7 +2402,7 @@ " 1\n", " 0\n", " 11\n", - " 582bc53b-96e2-4b09-8dd7-6a27a685a53e\n", + " 4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -2432,8 +2434,8 @@ "3 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "2 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", - "3 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", + "2 20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3 7758129997476962679 pdf \n", + "3 4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3 14711865278795535908 pdf \n", "\n", " hash size ... \\\n", "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... 
\n", diff --git a/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb b/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb index 04ed0fad4..129834ad5 100644 --- a/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb +++ b/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb @@ -9,6 +9,8 @@ "source": [ "# Processing PDFs using Data Prep Kit (Ray version)\n", "\n", + " [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb)\n", + "\n", "This notebook will introduce DPK and showcase some of it's capabilities.\n", "\n", "Here is the workflow:\n", @@ -19,7 +21,7 @@ "- fuzzy dedupe : filter out 'near duplicates'\n", "- document quality: scoring documents for quality\n", "\n", - "![](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png)\n" + "![](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png)\n" ] }, { @@ -33,7 +35,7 @@ "\n", "Two options:\n", "\n", - "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/process-pdf-1/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb)\n", + "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. 
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb)\n", "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", "\n", "The notebook will work as in both environments" @@ -298,11 +300,11 @@ "\n", "We will use simple PDFs. The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/pdf-processing-1/)\n", "\n", - "- [earth.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/earth.pdf) and exact duplicate [earth-copy.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/earth-copy.pdf)\n", - "- [earth2.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/earth2.pdf) almost similar to earth.pdf (ONE word difference!)\n", - "- [mars.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/mars.pdf)\n", - "- [spam.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/spam.pdf) - contains spammy contents\n", - "- [lorem-ipsum.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/lorem-ipsum.pdf) - contains 'lorem ipsum' placeholder\n" + "- [earth.pdf](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/earth.pdf) and exact duplicate [earth-copy.pdf](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/earth-copy.pdf)\n", + "- [earth2.pdf](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/earth2.pdf) almost similar to earth.pdf (ONE word difference!)\n", + "- 
[mars.pdf](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/mars.pdf)\n", + "- [spam.pdf](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/spam.pdf) - contains spammy contents\n", + "- [lorem-ipsum.pdf](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/lorem-ipsum.pdf) - contains 'lorem ipsum' placeholder\n" ] }, { @@ -330,17 +332,17 @@ "source": [ "if RUNNING_IN_COLAB:\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/data-files/pdf-processing-1/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/data-files/pdf-processing-1/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/data-files/pdf-processing-1/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/data-files/pdf-processing-1/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", "\n", - " download_file 
('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/spam.pdf', os.path.join(input_dir, 'spam.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/data-files/pdf-processing-1/spam.pdf', os.path.join(input_dir, 'spam.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/data-files/pdf-processing-1/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))\n", "else:\n", " print ('Using input files from : ', input_dir)" ] @@ -385,35 +387,35 @@ "name": "stderr", "output_type": "stream", "text": [ - "23:08:37 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", - "23:08:37 INFO - pipeline id pipeline_id\n", - "23:08:37 INFO - code location None\n", - "23:08:37 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", - "23:08:37 INFO - actor creation delay 0\n", - "23:08:37 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", - "23:08:37 INFO - data factory data_ is using local data access: input_folder - ../../data-files/pdf-processing-1/ output_folder - output/01_pdf2pq_out\n", - "23:08:37 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:08:37 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "23:08:37 INFO - Running locally\n", - "2025-02-04 23:08:38,509\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:42 INFO - orchestrator started at 2025-02-04 23:08:42\n", - "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:42 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", - "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:42 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.743361664935946, 'object_store': 4.371680831536651}\n", - "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:42 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(RayTransformFileProcessor pid=2171540)\u001b[0m 23:08:45 INFO - Initializing models\n", - "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 1688.38it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=2171540)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", - "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:54 INFO - Completed 1 files in 0.031 min\n", - "\u001b[36m(RayTransformFileProcessor pid=2171541)\u001b[0m 23:08:45 INFO - Initializing models\n", - "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 29723.41it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=2171541)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", - "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:54 INFO - Completed 2 files in 0.033 min\n", - "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:56 INFO - Completed 3 files in 0.062 min\n", - "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:56 INFO - Completed 4 files in 0.064 min\n", - "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:56 INFO - Completed 4 files (66.667%) in 0.064 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:58 INFO - Completed processing 6 files in 0.09 min\n", - "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:58 INFO - done flushing in 0.001 sec\n", - "23:09:08 INFO - Completed execution in 0.518 min, execution result 0\n" + "14:19:07 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", + "14:19:07 INFO - pipeline id pipeline_id\n", + "14:19:07 INFO - code location None\n", + "14:19:07 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "14:19:07 INFO - actor creation delay 0\n", + "14:19:07 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", + "14:19:07 INFO - data factory data_ is using local data access: input_folder - ../../data-files/pdf-processing-1/ output_folder - output/01_pdf2pq_out\n", + "14:19:07 INFO - data factory data_ max_files -1, n_sample -1\n", + "14:19:07 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "14:19:07 INFO - Running locally\n", + "2025-02-06 14:19:10,047\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3257941)\u001b[0m 14:19:14 INFO - orchestrator started at 2025-02-06 14:19:14\n", + "\u001b[36m(orchestrate pid=3257941)\u001b[0m 14:19:14 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", + "\u001b[36m(orchestrate pid=3257941)\u001b[0m 14:19:14 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 7.187278747558594, 'object_store': 3.593639373779297}\n", + "\u001b[36m(orchestrate pid=3257941)\u001b[0m 14:19:14 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(RayTransformFileProcessor pid=3258905)\u001b[0m 14:19:18 INFO - Initializing models\n", + "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 34505.24it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=3258905)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "\u001b[36m(orchestrate pid=3257941)\u001b[0m 14:19:27 INFO - Completed 1 files in 0.035 min\n", + "\u001b[36m(orchestrate pid=3257941)\u001b[0m 14:19:27 INFO - Completed 2 files in 0.035 min\n", + "\u001b[36m(RayTransformFileProcessor pid=3258906)\u001b[0m 14:19:18 INFO - Initializing models\n", + "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 21207.16it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=3258906)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "\u001b[36m(orchestrate pid=3257941)\u001b[0m 14:19:29 INFO - Completed 3 files in 0.066 min\n", + "\u001b[36m(orchestrate pid=3257941)\u001b[0m 14:19:29 INFO - Completed 4 files in 0.067 min\n", + "\u001b[36m(orchestrate pid=3257941)\u001b[0m 14:19:29 INFO - Completed 4 files (66.667%) in 0.067 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=3257941)\u001b[0m 14:19:30 INFO - Completed processing 6 files in 0.093 min\n", + "\u001b[36m(orchestrate pid=3257941)\u001b[0m 14:19:30 INFO - done flushing in 0.001 sec\n", + "14:19:40 INFO - Completed execution in 0.557 min, execution result 0\n" ] }, { @@ -517,13 +519,13 @@ " 1\n", " 0\n", " 2\n", - " 3618834f-9dfc-49a1-9066-e2724df95fec\n", + " 8dc8970e-215a-44fe-a7bf-946c03f36c60\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T23:08:56.820444\n", - " 1.846058\n", + " 2025-02-06T14:19:29.408910\n", + " 1.912304\n", " lorem-ipsum.pdf\n", " \n", " \n", @@ -533,13 +535,13 @@ " 1\n", " 0\n", " 2\n", - " 27880888-8e1a-4b46-a6a9-fecba8eee0eb\n", + " 9ac78463-b325-406b-891e-c9e84722eb34\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T23:08:58.414120\n", - " 1.590731\n", + " 2025-02-06T14:19:30.986464\n", + " 1.573836\n", " spam.pdf\n", " \n", " \n", @@ -549,13 +551,13 @@ " 1\n", " 0\n", " 11\n", - " 84f59118-2a64-4d4b-991c-10ca09576a74\n", + " b3ed1942-54a6-49fc-bcbc-2d8c438adef3\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-02-04T23:08:56.713495\n", - " 1.827202\n", + " 2025-02-06T14:19:29.335271\n", + " 1.850426\n", " earth2.pdf\n", " \n", " \n", @@ -565,13 +567,13 @@ " 1\n", " 0\n", " 11\n", - " 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff\n", + " 6d882651-2506-41cb-8704-85575c64b143\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T23:08:58.272496\n", - " 1.547326\n", + " 2025-02-06T14:19:30.950673\n", + " 1.612200\n", " mars.pdf\n", " \n", " \n", @@ -581,13 +583,13 @@ " 1\n", " 0\n", " 11\n", - " 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44\n", + " f8ccec16-576c-4e3e-8bec-359dff01d6d2\n", " 14711865278795535908\n", " pdf\n", " 
6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T23:08:54.872145\n", - " 1.864833\n", + " 2025-02-06T14:19:27.470409\n", + " 2.071769\n", " earth-copy.pdf\n", " \n", " \n", @@ -597,13 +599,13 @@ " 1\n", " 0\n", " 11\n", - " 5b12b0e8-946f-4538-8812-9ee74204c2d7\n", + " 18d940f3-f4b4-46ac-9147-077675aead1d\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T23:08:54.969828\n", - " 1.962273\n", + " 2025-02-06T14:19:27.492574\n", + " 2.093768\n", " earth.pdf\n", " \n", " \n", @@ -628,12 +630,12 @@ "5 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 3618834f-9dfc-49a1-9066-e2724df95fec 6571294142213095721 pdf \n", - "1 27880888-8e1a-4b46-a6a9-fecba8eee0eb 10026122586747302274 pdf \n", - "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", - "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", - "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", - "5 5b12b0e8-946f-4538-8812-9ee74204c2d7 14711865278795535908 pdf \n", + "0 8dc8970e-215a-44fe-a7bf-946c03f36c60 6571294142213095721 pdf \n", + "1 9ac78463-b325-406b-891e-c9e84722eb34 10026122586747302274 pdf \n", + "2 b3ed1942-54a6-49fc-bcbc-2d8c438adef3 10729312978404042321 pdf \n", + "3 6d882651-2506-41cb-8704-85575c64b143 7758129997476962679 pdf \n", + "4 f8ccec16-576c-4e3e-8bec-359dff01d6d2 14711865278795535908 pdf \n", + "5 18d940f3-f4b4-46ac-9147-077675aead1d 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -644,12 +646,12 @@ "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \n", - "0 2025-02-04T23:08:56.820444 1.846058 lorem-ipsum.pdf \n", - "1 2025-02-04T23:08:58.414120 1.590731 spam.pdf \n", - "2 2025-02-04T23:08:56.713495 1.827202 earth2.pdf \n", - "3 2025-02-04T23:08:58.272496 1.547326 mars.pdf \n", - "4 2025-02-04T23:08:54.872145 1.864833 earth-copy.pdf \n", - "5 2025-02-04T23:08:54.969828 1.962273 earth.pdf " + "0 2025-02-06T14:19:29.408910 1.912304 lorem-ipsum.pdf \n", + "1 2025-02-06T14:19:30.986464 1.573836 spam.pdf \n", + "2 2025-02-06T14:19:29.335271 1.850426 earth2.pdf \n", + "3 2025-02-06T14:19:30.950673 1.612200 mars.pdf \n", + "4 2025-02-06T14:19:27.470409 2.071769 earth-copy.pdf \n", + "5 2025-02-06T14:19:27.492574 2.093768 earth.pdf " ] }, "execution_count": 9, @@ -802,29 +804,29 @@ "name": "stderr", "output_type": "stream", "text": [ - "23:09:09 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", - "23:09:09 INFO - pipeline id pipeline_id\n", - "23:09:09 INFO - code location None\n", - "23:09:09 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", - "23:09:09 INFO - actor creation delay 0\n", - "23:09:09 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", - "23:09:09 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", - "23:09:09 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:09:09 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:09:09 INFO - Running locally\n", - "2025-02-04 23:09:10,988\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:12 INFO - orchestrator started at 2025-02-04 23:09:12\n", - "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:12 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", - "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:12 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.800101472064853, 'object_store': 4.400050735101104}\n", - "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:12 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed 1 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed 2 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed 3 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed 4 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed 4 files (66.667%) in 0.003 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed processing 6 files in 0.004 min\n", - "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - done flushing in 0.001 sec\n", - "23:09:23 INFO - Completed execution in 0.226 min, execution result 0\n" + "14:19:42 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", + "14:19:42 INFO - pipeline id pipeline_id\n", + "14:19:42 INFO - code location None\n", + "14:19:42 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "14:19:42 INFO - actor creation delay 0\n", + "14:19:42 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", + "14:19:42 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", + "14:19:42 INFO - data factory data_ max_files -1, n_sample -1\n", + "14:19:42 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "14:19:42 INFO - Running locally\n", + "2025-02-06 14:19:43,706\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3259648)\u001b[0m 14:19:45 INFO - orchestrator started at 2025-02-06 14:19:45\n", + "\u001b[36m(orchestrate pid=3259648)\u001b[0m 14:19:45 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", + "\u001b[36m(orchestrate pid=3259648)\u001b[0m 14:19:45 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 7.290360260754824, 'object_store': 3.6451801294460893}\n", + "\u001b[36m(orchestrate pid=3259648)\u001b[0m 14:19:45 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=3259648)\u001b[0m 14:19:46 INFO - Completed 1 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=3259648)\u001b[0m 14:19:46 INFO - Completed 2 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=3259648)\u001b[0m 14:19:46 INFO - Completed 3 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=3259648)\u001b[0m 14:19:46 INFO - Completed 4 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=3259648)\u001b[0m 14:19:46 INFO - Completed 4 files (66.667%) in 0.004 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=3259648)\u001b[0m 14:19:46 INFO - Completed processing 6 files in 0.005 min\n", + "\u001b[36m(orchestrate pid=3259648)\u001b[0m 14:19:46 INFO - done flushing in 0.001 sec\n", + "14:19:56 INFO - Completed execution in 0.234 min, execution result 0\n" ] }, { @@ -832,8 +834,8 @@ "output_type": "stream", "text": [ "✅ Stage:2 completed successfully\n", - "CPU times: user 122 ms, sys: 132 ms, total: 254 ms\n", - "Wall time: 14.8 s\n" + "CPU times: user 115 ms, sys: 137 ms, total: 251 ms\n", + "Wall time: 15.3 s\n" ] } ], @@ -934,13 +936,13 @@ " 1\n", " 0\n", " 2\n", - " 3618834f-9dfc-49a1-9066-e2724df95fec\n", + " 8dc8970e-215a-44fe-a7bf-946c03f36c60\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T23:08:56.820444\n", - " 1.846058\n", + " 2025-02-06T14:19:29.408910\n", + " 1.912304\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 3\n", @@ -952,13 +954,13 @@ " 1\n", " 0\n", " 2\n", - " 27880888-8e1a-4b46-a6a9-fecba8eee0eb\n", + " 9ac78463-b325-406b-891e-c9e84722eb34\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T23:08:58.414120\n", - " 1.590731\n", + " 2025-02-06T14:19:30.986464\n", + " 1.573836\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", @@ -970,13 +972,13 @@ " 1\n", " 0\n", " 11\n", - " 84f59118-2a64-4d4b-991c-10ca09576a74\n", + " b3ed1942-54a6-49fc-bcbc-2d8c438adef3\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-02-04T23:08:56.713495\n", - " 1.827202\n", + " 2025-02-06T14:19:29.335271\n", + " 1.850426\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 2\n", @@ -988,13 +990,13 @@ " 1\n", " 0\n", " 11\n", - " 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff\n", + " 6d882651-2506-41cb-8704-85575c64b143\n", " 
7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T23:08:58.272496\n", - " 1.547326\n", + " 2025-02-06T14:19:30.950673\n", + " 1.612200\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", @@ -1006,13 +1008,13 @@ " 1\n", " 0\n", " 11\n", - " 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44\n", + " f8ccec16-576c-4e3e-8bec-359dff01d6d2\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T23:08:54.872145\n", - " 1.864833\n", + " 2025-02-06T14:19:27.470409\n", + " 2.071769\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 1\n", @@ -1024,13 +1026,13 @@ " 1\n", " 0\n", " 11\n", - " 5b12b0e8-946f-4538-8812-9ee74204c2d7\n", + " 18d940f3-f4b4-46ac-9147-077675aead1d\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T23:08:54.969828\n", - " 1.962273\n", + " 2025-02-06T14:19:27.492574\n", + " 2.093768\n", " earth.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", @@ -1057,12 +1059,12 @@ "5 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 3618834f-9dfc-49a1-9066-e2724df95fec 6571294142213095721 pdf \n", - "1 27880888-8e1a-4b46-a6a9-fecba8eee0eb 10026122586747302274 pdf \n", - "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", - "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", - "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", - "5 5b12b0e8-946f-4538-8812-9ee74204c2d7 14711865278795535908 pdf \n", + "0 8dc8970e-215a-44fe-a7bf-946c03f36c60 6571294142213095721 pdf \n", + "1 9ac78463-b325-406b-891e-c9e84722eb34 10026122586747302274 pdf \n", + "2 b3ed1942-54a6-49fc-bcbc-2d8c438adef3 10729312978404042321 pdf \n", + "3 6d882651-2506-41cb-8704-85575c64b143 7758129997476962679 pdf \n", + "4 f8ccec16-576c-4e3e-8bec-359dff01d6d2 
14711865278795535908 pdf \n", + "5 18d940f3-f4b4-46ac-9147-077675aead1d 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1073,12 +1075,12 @@ "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-02-04T23:08:56.820444 1.846058 lorem-ipsum.pdf \n", - "1 2025-02-04T23:08:58.414120 1.590731 spam.pdf \n", - "2 2025-02-04T23:08:56.713495 1.827202 earth2.pdf \n", - "3 2025-02-04T23:08:58.272496 1.547326 mars.pdf \n", - "4 2025-02-04T23:08:54.872145 1.864833 earth-copy.pdf \n", - "5 2025-02-04T23:08:54.969828 1.962273 earth.pdf \n", + "0 2025-02-06T14:19:29.408910 1.912304 lorem-ipsum.pdf \n", + "1 2025-02-06T14:19:30.986464 1.573836 spam.pdf \n", + "2 2025-02-06T14:19:29.335271 1.850426 earth2.pdf \n", + "3 2025-02-06T14:19:30.950673 1.612200 mars.pdf \n", + "4 2025-02-06T14:19:27.470409 2.071769 earth-copy.pdf \n", + "5 2025-02-06T14:19:27.492574 2.093768 earth.pdf \n", "\n", " doc_hash int_id_column \n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 
3 \n", @@ -1140,29 +1142,29 @@ "name": "stderr", "output_type": "stream", "text": [ - "23:09:24 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", - "23:09:24 INFO - pipeline id pipeline_id\n", - "23:09:24 INFO - code location None\n", - "23:09:24 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", - "23:09:24 INFO - actor creation delay 0\n", - "23:09:24 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", - "23:09:24 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", - "23:09:24 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:09:24 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:09:24 INFO - Running locally\n", - "2025-02-04 23:09:25,887\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:27 INFO - orchestrator started at 2025-02-04 23:09:27\n", - "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:27 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", - "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:27 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.715737915597856, 'object_store': 4.357868957333267}\n", - "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:27 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed 1 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed 2 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed 3 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed 4 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed 4 files (66.667%) in 0.003 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed processing 6 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - done flushing in 0.001 sec\n", - "23:09:38 INFO - Completed execution in 0.226 min, execution result 0\n" + "14:19:57 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", + "14:19:57 INFO - pipeline id pipeline_id\n", + "14:19:57 INFO - code location None\n", + "14:19:57 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "14:19:57 INFO - actor creation delay 0\n", + "14:19:57 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", + "14:19:57 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", + "14:19:57 INFO - data factory data_ max_files -1, n_sample -1\n", + "14:19:57 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "14:19:57 INFO - Running locally\n", + "2025-02-06 14:19:58,746\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3261174)\u001b[0m 14:20:00 INFO - orchestrator started at 2025-02-06 14:20:00\n", + "\u001b[36m(orchestrate pid=3261174)\u001b[0m 14:20:00 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", + "\u001b[36m(orchestrate pid=3261174)\u001b[0m 14:20:00 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 7.283956146799028, 'object_store': 3.6419780729338527}\n", + "\u001b[36m(orchestrate pid=3261174)\u001b[0m 14:20:00 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=3261174)\u001b[0m 14:20:01 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3261174)\u001b[0m 14:20:01 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3261174)\u001b[0m 14:20:01 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3261174)\u001b[0m 14:20:01 INFO - Completed 4 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3261174)\u001b[0m 14:20:01 INFO - Completed 4 files (66.667%) in 0.003 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=3261174)\u001b[0m 14:20:01 INFO - Completed processing 6 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3261174)\u001b[0m 14:20:01 INFO - done flushing in 0.001 sec\n", + "14:20:11 INFO - Completed execution in 0.225 min, execution result 0\n" ] }, { @@ -1170,8 +1172,8 @@ "output_type": "stream", "text": [ "✅ Stage:3 completed successfully\n", - "CPU times: user 144 ms, sys: 164 ms, total: 308 ms\n", - "Wall time: 14.8 s\n" + "CPU times: user 98.9 ms, sys: 129 ms, total: 228 ms\n", + "Wall time: 15 s\n" ] } ], @@ -1275,13 +1277,13 @@ " 1\n", " 0\n", " 2\n", - " 3618834f-9dfc-49a1-9066-e2724df95fec\n", + " 8dc8970e-215a-44fe-a7bf-946c03f36c60\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T23:08:56.820444\n", - " 1.846058\n", + " 2025-02-06T14:19:29.408910\n", + " 1.912304\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 3\n", @@ -1294,13 +1296,13 @@ " 1\n", " 0\n", " 2\n", - " 27880888-8e1a-4b46-a6a9-fecba8eee0eb\n", + " 9ac78463-b325-406b-891e-c9e84722eb34\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T23:08:58.414120\n", - " 1.590731\n", + " 2025-02-06T14:19:30.986464\n", + " 1.573836\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", @@ -1313,13 +1315,13 @@ " 1\n", " 0\n", " 11\n", - " 84f59118-2a64-4d4b-991c-10ca09576a74\n", + " b3ed1942-54a6-49fc-bcbc-2d8c438adef3\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-02-04T23:08:56.713495\n", - " 1.827202\n", + " 2025-02-06T14:19:29.335271\n", + " 1.850426\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 2\n", @@ -1332,13 +1334,13 @@ " 1\n", " 0\n", " 11\n", - " 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff\n", + " 6d882651-2506-41cb-8704-85575c64b143\n", " 
7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T23:08:58.272496\n", - " 1.547326\n", + " 2025-02-06T14:19:30.950673\n", + " 1.612200\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", @@ -1351,13 +1353,13 @@ " 1\n", " 0\n", " 11\n", - " 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44\n", + " f8ccec16-576c-4e3e-8bec-359dff01d6d2\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T23:08:54.872145\n", - " 1.864833\n", + " 2025-02-06T14:19:27.470409\n", + " 2.071769\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 1\n", @@ -1383,11 +1385,11 @@ "4 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 3618834f-9dfc-49a1-9066-e2724df95fec 6571294142213095721 pdf \n", - "1 27880888-8e1a-4b46-a6a9-fecba8eee0eb 10026122586747302274 pdf \n", - "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", - "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", - "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", + "0 8dc8970e-215a-44fe-a7bf-946c03f36c60 6571294142213095721 pdf \n", + "1 9ac78463-b325-406b-891e-c9e84722eb34 10026122586747302274 pdf \n", + "2 b3ed1942-54a6-49fc-bcbc-2d8c438adef3 10729312978404042321 pdf \n", + "3 6d882651-2506-41cb-8704-85575c64b143 7758129997476962679 pdf \n", + "4 f8ccec16-576c-4e3e-8bec-359dff01d6d2 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1397,11 +1399,11 @@ "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-02-04T23:08:56.820444 1.846058 lorem-ipsum.pdf \n", - "1 2025-02-04T23:08:58.414120 1.590731 spam.pdf \n", - "2 2025-02-04T23:08:56.713495 1.827202 earth2.pdf \n", - "3 2025-02-04T23:08:58.272496 1.547326 mars.pdf \n", - "4 2025-02-04T23:08:54.872145 1.864833 earth-copy.pdf \n", + "0 2025-02-06T14:19:29.408910 1.912304 lorem-ipsum.pdf \n", + "1 2025-02-06T14:19:30.986464 1.573836 spam.pdf \n", + "2 2025-02-06T14:19:29.335271 1.850426 earth2.pdf \n", + "3 2025-02-06T14:19:30.950673 1.612200 mars.pdf \n", + "4 2025-02-06T14:19:27.470409 2.071769 earth-copy.pdf \n", "\n", " doc_hash int_id_column removed \n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", @@ -1486,133 +1488,133 @@ "name": "stderr", "output_type": "stream", "text": [ - "23:09:39 INFO - Starting SignatureCalculation step\n", - "23:09:39 INFO - Got parameters for SignatureCalculation\n", - "23:09:39 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.9, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "23:09:39 INFO - data factory scdata_ is using local configuration without input/output path\n", - "23:09:39 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "23:09:39 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:09:39 INFO - pipeline id pipeline_id\n", - "23:09:39 INFO - code location None\n", - "23:09:39 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "23:09:39 INFO - actor creation delay 0\n", - "23:09:39 INFO - job details {'job category': 'preprocessing', 'job name': 'minhash', 'job type': 'ray', 'job id': 'job_id'}\n", - "23:09:39 INFO - data 
factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "23:09:39 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:09:39 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:09:39 INFO - Running locally\n", - "2025-02-04 23:09:40,737\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:41 INFO - orchestrator started at 2025-02-04 23:09:41\n", - "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:41 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", - "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:41 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.727081298828125, 'object_store': 4.3635406494140625}\n", - "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:41 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - Completed 1 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - Completed 2 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - Completed 3 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - Completed 3 files (50.0%) in 0.003 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - Completed processing 6 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - done flushing in 0.026 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=2176344)\u001b[0m 23:09:43 WARNING - table is empty, skipping processing\n", - "\u001b[36m(RayTransformFileProcessor pid=2176344)\u001b[0m 23:09:43 INFO - Starting flush()\n", - "\u001b[36m(RayTransformFileProcessor pid=2176344)\u001b[0m 23:09:43 INFO - Wrote 14 tables with a total size of 6,720 bytes\n", - "23:09:53 INFO - Completed execution in 0.224 min, execution result 0\n", - "\u001b[36m(RayTransformFileProcessor pid=2176343)\u001b[0m 23:09:43 INFO - Starting flush()\u001b[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n", - "\u001b[36m(RayTransformFileProcessor pid=2176343)\u001b[0m 23:09:43 INFO - Wrote 14 tables with a total size of 13,440 bytes\u001b[32m [repeated 2x across cluster]\u001b[0m\n", - "23:09:54 INFO - SignatureCalculation completed successfully\n", - "23:09:54 INFO - Starting ClusterAnalysis step\n", - "23:09:54 INFO - Got parameters for ClusterAnalysis\n", - "23:09:54 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.9, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "23:09:54 INFO - pipeline id pipeline_id\n", - "23:09:54 INFO - code location None\n", - "23:09:54 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "23:09:54 INFO - actor creation delay 0\n", - "23:09:54 INFO - job details {'job category': 'preprocessing', 'job name': 'cluster', 'job type': 'ray', 'job id': 'job_id'}\n", - "23:09:54 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands 
output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", - "23:09:54 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:09:54 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:09:54 INFO - Running locally\n", - "2025-02-04 23:09:55,736\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:56 INFO - orchestrator started at 2025-02-04 23:09:56\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:56 INFO - Number of folders is 14\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:56 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.763642883859575, 'object_store': 4.381821441464126}\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:56 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 1 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 2 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 3 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 4 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 5 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 6 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 7 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 8 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 9 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 10 files in 0.0 
min\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 11 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 11 files (78.571%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed processing 14 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - done flushing in 0.001 sec\n", - "23:10:08 INFO - Completed execution in 0.222 min, execution result 0\n", - "23:10:09 INFO - ClusterAnalysis completed successfully\n", - "23:10:09 INFO - Starting GetDuplicateList step\n", - "23:10:09 INFO - Got parameters for GetDuplicateList\n", - "23:10:09 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", - "23:10:09 INFO - pipeline id pipeline_id\n", - "23:10:09 INFO - code location None\n", - "23:10:09 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "23:10:09 INFO - actor creation delay 0\n", - "23:10:09 INFO - job details {'job category': 'preprocessing', 'job name': 'fdlist', 'job type': 'ray', 'job id': 'job_id'}\n", - "23:10:09 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "23:10:09 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:10:09 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:10:09 INFO - Running locally\n", - "2025-02-04 23:10:10,430\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:11 INFO - orchestrator started at 2025-02-04 23:10:11\n", - "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:11 INFO - Number of folders is 1\n", - "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:11 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.75473709218204, 'object_store': 4.3773685451596975}\n", - "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:11 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:12 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:12 INFO - Completed processing 1 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:12 INFO - done flushing in 0.001 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=2179461)\u001b[0m 23:10:12 INFO - Get Duplicate List for folder docs_to_remove\n", - "\u001b[36m(RayTransformFileProcessor pid=2179461)\u001b[0m 23:10:12 INFO - 0 documents marked as duplicates\n", - "23:10:22 INFO - Completed execution in 0.223 min, execution result 0\n", - "23:10:24 INFO - GetDuplicateList completed successfully\n", - "23:10:24 INFO - Starting DataCleaning step\n", - "23:10:24 INFO - Got parameters for DataCleaning\n", - "23:10:24 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", - "23:10:24 INFO - data factory dcdata_ is using local configuration without input/output path\n", - "23:10:24 INFO - data factory dcdata_ max_files -1, n_sample -1\n", - "23:10:24 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - 
"23:10:24 INFO - pipeline id pipeline_id\n", - "23:10:24 INFO - code location None\n", - "23:10:24 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "23:10:24 INFO - actor creation delay 0\n", - "23:10:24 INFO - job details {'job category': 'preprocessing', 'job name': 'fdclean', 'job type': 'ray', 'job id': 'job_id'}\n", - "23:10:24 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", - "23:10:24 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:10:24 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:10:24 INFO - Running locally\n", - "2025-02-04 23:10:25,111\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:26 INFO - orchestrator started at 2025-02-04 23:10:26\n", - "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:26 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", - "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:26 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.744503784924746, 'object_store': 4.37225189153105}\n", - "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:26 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - Completed 1 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - Completed 2 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - Completed 3 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - Completed 3 
files (50.0%) in 0.003 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - Completed processing 6 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - done flushing in 0.001 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=2180888)\u001b[0m 23:10:27 WARNING - table is empty, skipping processing\n", - "23:10:37 INFO - Completed execution in 0.224 min, execution result 0\n", - "23:10:38 INFO - DataCleaning completed successfully\n" + "14:20:12 INFO - Starting SignatureCalculation step\n", + "14:20:12 INFO - Got parameters for SignatureCalculation\n", + "14:20:12 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.9, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "14:20:12 INFO - data factory scdata_ is using local configuration without input/output path\n", + "14:20:12 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "14:20:12 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "14:20:12 INFO - pipeline id pipeline_id\n", + "14:20:12 INFO - code location None\n", + "14:20:12 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "14:20:12 INFO - actor creation delay 0\n", + "14:20:12 INFO - job details {'job category': 'preprocessing', 'job name': 'minhash', 'job type': 'ray', 'job id': 'job_id'}\n", + "14:20:12 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "14:20:12 INFO - data factory data_ max_files -1, n_sample -1\n", + "14:20:12 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to 
use ['.parquet'], files to checkpoint ['.parquet']\n", + "14:20:12 INFO - Running locally\n", + "2025-02-06 14:20:13,822\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3262907)\u001b[0m 14:20:15 INFO - orchestrator started at 2025-02-06 14:20:15\n", + "\u001b[36m(orchestrate pid=3262907)\u001b[0m 14:20:15 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "\u001b[36m(orchestrate pid=3262907)\u001b[0m 14:20:15 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 7.180192566476762, 'object_store': 3.59009628277272}\n", + "\u001b[36m(orchestrate pid=3262907)\u001b[0m 14:20:15 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=3262907)\u001b[0m 14:20:16 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3262907)\u001b[0m 14:20:16 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3262907)\u001b[0m 14:20:16 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3262907)\u001b[0m 14:20:16 INFO - Completed 3 files (50.0%) in 0.003 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=3262907)\u001b[0m 14:20:16 INFO - Completed processing 6 files in 0.003 min\n", + "\u001b[36m(RayTransformFileProcessor pid=3263786)\u001b[0m 14:20:16 INFO - Starting flush()\n", + "\u001b[36m(RayTransformFileProcessor pid=3263785)\u001b[0m 14:20:16 WARNING - table is empty, skipping processing\n", + "\u001b[36m(orchestrate pid=3262907)\u001b[0m 14:20:16 INFO - done flushing in 0.03 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=3263786)\u001b[0m 14:20:16 INFO - Wrote 14 tables with a total size of 13,440 bytes\n", + "14:20:26 INFO - Completed execution in 0.227 min, execution result 0\n", + "\u001b[36m(RayTransformFileProcessor pid=3263785)\u001b[0m 14:20:16 INFO - Starting flush()\u001b[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n", + "\u001b[36m(RayTransformFileProcessor pid=3263785)\u001b[0m 14:20:16 INFO - Wrote 14 tables with a total size of 13,440 bytes\u001b[32m [repeated 2x across cluster]\u001b[0m\n", + "14:20:27 INFO - SignatureCalculation completed successfully\n", + "14:20:27 INFO - Starting ClusterAnalysis step\n", + "14:20:27 INFO - Got parameters for ClusterAnalysis\n", + "14:20:27 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.9, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "14:20:27 INFO - pipeline id pipeline_id\n", + "14:20:27 INFO - code location None\n", + "14:20:27 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "14:20:27 INFO - actor creation delay 0\n", + "14:20:27 INFO - job details {'job category': 'preprocessing', 'job name': 'cluster', 'job type': 'ray', 'job id': 'job_id'}\n", + "14:20:27 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands 
output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", + "14:20:27 INFO - data factory data_ max_files -1, n_sample -1\n", + "14:20:27 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "14:20:27 INFO - Running locally\n", + "2025-02-06 14:20:28,857\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:30 INFO - orchestrator started at 2025-02-06 14:20:30\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:30 INFO - Number of folders is 14\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:30 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 7.263226319104433, 'object_store': 3.631613158620894}\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:30 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:31 INFO - Completed 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:31 INFO - Completed 2 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:31 INFO - Completed 3 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:31 INFO - Completed 4 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:31 INFO - Completed 5 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:31 INFO - Completed 6 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:31 INFO - Completed 7 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:31 INFO - Completed 8 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:31 INFO - Completed 9 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:31 INFO - Completed 10 files in 0.0 
min\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:31 INFO - Completed 11 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:31 INFO - Completed 11 files (78.571%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:31 INFO - Completed processing 14 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3264596)\u001b[0m 14:20:31 INFO - done flushing in 0.001 sec\n", + "14:20:41 INFO - Completed execution in 0.223 min, execution result 0\n", + "14:20:42 INFO - ClusterAnalysis completed successfully\n", + "14:20:42 INFO - Starting GetDuplicateList step\n", + "14:20:42 INFO - Got parameters for GetDuplicateList\n", + "14:20:42 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "14:20:42 INFO - pipeline id pipeline_id\n", + "14:20:42 INFO - code location None\n", + "14:20:42 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "14:20:42 INFO - actor creation delay 0\n", + "14:20:42 INFO - job details {'job category': 'preprocessing', 'job name': 'fdlist', 'job type': 'ray', 'job id': 'job_id'}\n", + "14:20:42 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "14:20:42 INFO - data factory data_ max_files -1, n_sample -1\n", + "14:20:42 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "14:20:42 INFO - Running locally\n", + "2025-02-06 14:20:43,486\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3266161)\u001b[0m 14:20:44 INFO - orchestrator started at 2025-02-06 14:20:44\n", + "\u001b[36m(orchestrate pid=3266161)\u001b[0m 14:20:44 INFO - Number of folders is 1\n", + "\u001b[36m(orchestrate pid=3266161)\u001b[0m 14:20:44 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 7.259747315198183, 'object_store': 3.629873656667769}\n", + "\u001b[36m(orchestrate pid=3266161)\u001b[0m 14:20:44 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=3266161)\u001b[0m 14:20:45 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=3266161)\u001b[0m 14:20:45 INFO - Completed processing 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3266161)\u001b[0m 14:20:45 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=3267037)\u001b[0m 14:20:45 INFO - Get Duplicate List for folder docs_to_remove\n", + "\u001b[36m(RayTransformFileProcessor pid=3267037)\u001b[0m 14:20:45 INFO - 0 documents marked as duplicates\n", + "14:20:55 INFO - Completed execution in 0.222 min, execution result 0\n", + "14:20:57 INFO - GetDuplicateList completed successfully\n", + "14:20:57 INFO - Starting DataCleaning step\n", + "14:20:57 INFO - Got parameters for DataCleaning\n", + "14:20:57 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "14:20:57 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "14:20:57 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "14:20:57 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + 
"14:20:57 INFO - pipeline id pipeline_id\n", + "14:20:57 INFO - code location None\n", + "14:20:57 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "14:20:57 INFO - actor creation delay 0\n", + "14:20:57 INFO - job details {'job category': 'preprocessing', 'job name': 'fdclean', 'job type': 'ray', 'job id': 'job_id'}\n", + "14:20:57 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", + "14:20:57 INFO - data factory data_ max_files -1, n_sample -1\n", + "14:20:57 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "14:20:57 INFO - Running locally\n", + "2025-02-06 14:20:58,292\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3267588)\u001b[0m 14:20:59 INFO - orchestrator started at 2025-02-06 14:20:59\n", + "\u001b[36m(orchestrate pid=3267588)\u001b[0m 14:20:59 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "\u001b[36m(orchestrate pid=3267588)\u001b[0m 14:20:59 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 7.28473205678165, 'object_store': 3.642366027459502}\n", + "\u001b[36m(orchestrate pid=3267588)\u001b[0m 14:20:59 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=3267588)\u001b[0m 14:21:00 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3267588)\u001b[0m 14:21:00 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3267588)\u001b[0m 14:21:00 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3267588)\u001b[0m 14:21:00 INFO - Completed 3 
files (50.0%) in 0.003 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=3267588)\u001b[0m 14:21:00 INFO - Completed processing 6 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3267588)\u001b[0m 14:21:00 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=3268467)\u001b[0m 14:21:00 WARNING - table is empty, skipping processing\n", + "14:21:10 INFO - Completed execution in 0.226 min, execution result 0\n", + "14:21:12 INFO - DataCleaning completed successfully\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 603 ms, sys: 679 ms, total: 1.28 s\n", - "Wall time: 59.2 s\n" + "CPU times: user 558 ms, sys: 526 ms, total: 1.08 s\n", + "Wall time: 59.5 s\n" ] } ], @@ -1721,13 +1723,13 @@ " 1\n", " 0\n", " 2\n", - " 3618834f-9dfc-49a1-9066-e2724df95fec\n", + " 8dc8970e-215a-44fe-a7bf-946c03f36c60\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T23:08:56.820444\n", - " 1.846058\n", + " 2025-02-06T14:19:29.408910\n", + " 1.912304\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 3\n", @@ -1740,13 +1742,13 @@ " 1\n", " 0\n", " 2\n", - " 27880888-8e1a-4b46-a6a9-fecba8eee0eb\n", + " 9ac78463-b325-406b-891e-c9e84722eb34\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T23:08:58.414120\n", - " 1.590731\n", + " 2025-02-06T14:19:30.986464\n", + " 1.573836\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", @@ -1759,13 +1761,13 @@ " 1\n", " 0\n", " 11\n", - " 84f59118-2a64-4d4b-991c-10ca09576a74\n", + " b3ed1942-54a6-49fc-bcbc-2d8c438adef3\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-02-04T23:08:56.713495\n", - " 1.827202\n", + " 2025-02-06T14:19:29.335271\n", + " 1.850426\n", " earth2.pdf\n", " 
f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 2\n", @@ -1778,13 +1780,13 @@ " 1\n", " 0\n", " 11\n", - " 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff\n", + " 6d882651-2506-41cb-8704-85575c64b143\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T23:08:58.272496\n", - " 1.547326\n", + " 2025-02-06T14:19:30.950673\n", + " 1.612200\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", @@ -1797,13 +1799,13 @@ " 1\n", " 0\n", " 11\n", - " 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44\n", + " f8ccec16-576c-4e3e-8bec-359dff01d6d2\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T23:08:54.872145\n", - " 1.864833\n", + " 2025-02-06T14:19:27.470409\n", + " 2.071769\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 1\n", @@ -1829,11 +1831,11 @@ "4 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 3618834f-9dfc-49a1-9066-e2724df95fec 6571294142213095721 pdf \n", - "1 27880888-8e1a-4b46-a6a9-fecba8eee0eb 10026122586747302274 pdf \n", - "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", - "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", - "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", + "0 8dc8970e-215a-44fe-a7bf-946c03f36c60 6571294142213095721 pdf \n", + "1 9ac78463-b325-406b-891e-c9e84722eb34 10026122586747302274 pdf \n", + "2 b3ed1942-54a6-49fc-bcbc-2d8c438adef3 10729312978404042321 pdf \n", + "3 6d882651-2506-41cb-8704-85575c64b143 7758129997476962679 pdf \n", + "4 f8ccec16-576c-4e3e-8bec-359dff01d6d2 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1843,11 +1845,11 @@ "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-02-04T23:08:56.820444 1.846058 lorem-ipsum.pdf \n", - "1 2025-02-04T23:08:58.414120 1.590731 spam.pdf \n", - "2 2025-02-04T23:08:56.713495 1.827202 earth2.pdf \n", - "3 2025-02-04T23:08:58.272496 1.547326 mars.pdf \n", - "4 2025-02-04T23:08:54.872145 1.864833 earth-copy.pdf \n", + "0 2025-02-06T14:19:29.408910 1.912304 lorem-ipsum.pdf \n", + "1 2025-02-06T14:19:30.986464 1.573836 spam.pdf \n", + "2 2025-02-06T14:19:29.335271 1.850426 earth2.pdf \n", + "3 2025-02-06T14:19:30.950673 1.612200 mars.pdf \n", + "4 2025-02-06T14:19:27.470409 2.071769 earth-copy.pdf \n", "\n", " doc_hash int_id_column removed \n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", @@ -1918,33 +1920,33 @@ "name": "stderr", "output_type": "stream", "text": [ - "23:10:38 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", - "23:10:38 INFO - data factory docq_ is using local configuration without input/output path\n", - "23:10:38 INFO - data factory docq_ max_files -1, n_sample -1\n", - "23:10:38 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:10:38 INFO - pipeline id pipeline_id\n", - "23:10:38 INFO - code location None\n", - "23:10:38 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", - "23:10:38 INFO - actor creation delay 0\n", - "23:10:38 INFO - job details {'job category': 'preprocessing', 'job name': 'docq', 'job type': 'ray', 'job id': 'job_id'}\n", - "23:10:38 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - 
output/05_doc_quality_out\n", - "23:10:38 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:10:38 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:10:38 INFO - Running locally\n", - "2025-02-04 23:10:39,863\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:41 INFO - orchestrator started at 2025-02-04 23:10:41\n", - "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:41 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0069904327392578125, 'total_file_size': 0.04752826690673828}\n", - "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:41 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.727170563302934, 'object_store': 4.363585281185806}\n", - "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:41 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(RayTransformFileProcessor pid=2182506)\u001b[0m 23:10:41 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", - "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - Completed 1 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - Completed 2 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - Completed 3 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - Completed 3 files (60.0%) in 0.003 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - Completed processing 5 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - done flushing in 0.001 sec\n", - "23:10:52 INFO - Completed execution in 0.223 min, execution result 0\n", - "\u001b[36m(RayTransformFileProcessor pid=2182507)\u001b[0m 23:10:41 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n" + "14:21:12 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "14:21:12 INFO - data factory docq_ is using local configuration without input/output path\n", + "14:21:12 INFO - data factory docq_ max_files -1, n_sample -1\n", + "14:21:12 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "14:21:12 INFO - pipeline id pipeline_id\n", + "14:21:12 INFO - code location None\n", + "14:21:12 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "14:21:12 INFO - actor creation delay 0\n", + "14:21:12 INFO - job details {'job category': 'preprocessing', 'job name': 'docq', 'job type': 'ray', 'job id': 'job_id'}\n", + "14:21:12 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", + "14:21:12 INFO - data factory data_ max_files -1, n_sample -1\n", + "14:21:12 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "14:21:12 INFO - 
Running locally\n", + "2025-02-06 14:21:13,443\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3269230)\u001b[0m 14:21:14 INFO - orchestrator started at 2025-02-06 14:21:14\n", + "\u001b[36m(orchestrate pid=3269230)\u001b[0m 14:21:14 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0069904327392578125, 'total_file_size': 0.04752826690673828}\n", + "\u001b[36m(orchestrate pid=3269230)\u001b[0m 14:21:14 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 7.264469146728516, 'object_store': 3.632234573364258}\n", + "\u001b[36m(orchestrate pid=3269230)\u001b[0m 14:21:14 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(RayTransformFileProcessor pid=3270111)\u001b[0m 14:21:15 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", + "\u001b[36m(orchestrate pid=3269230)\u001b[0m 14:21:16 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3269230)\u001b[0m 14:21:16 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3269230)\u001b[0m 14:21:16 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3269230)\u001b[0m 14:21:16 INFO - Completed 3 files (60.0%) in 0.003 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=3269230)\u001b[0m 14:21:16 INFO - Completed processing 5 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3269230)\u001b[0m 14:21:16 INFO - done flushing in 0.001 sec\n", + "14:21:26 INFO - Completed execution in 0.227 min, execution result 0\n", + "\u001b[36m(RayTransformFileProcessor pid=3270112)\u001b[0m 14:21:15 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n" ] }, { @@ -1952,8 +1954,8 @@ "output_type": "stream", "text": [ "✅ Stage:5 completed successfully\n", - "CPU times: user 116 ms, sys: 125 ms, total: 240 ms\n", - "Wall time: 14.8 s\n" + "CPU times: user 122 ms, sys: 128 ms, total: 250 ms\n", + "Wall time: 14.9 s\n" ] } ], @@ -2065,7 +2067,7 @@ " 1\n", " 0\n", " 2\n", - " 3618834f-9dfc-49a1-9066-e2724df95fec\n", + " 8dc8970e-215a-44fe-a7bf-946c03f36c60\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", @@ -2089,7 +2091,7 @@ " 1\n", " 0\n", " 2\n", - " 27880888-8e1a-4b46-a6a9-fecba8eee0eb\n", + " 9ac78463-b325-406b-891e-c9e84722eb34\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", @@ -2113,7 +2115,7 @@ " 1\n", " 0\n", " 11\n", - " 84f59118-2a64-4d4b-991c-10ca09576a74\n", + " b3ed1942-54a6-49fc-bcbc-2d8c438adef3\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", @@ -2137,7 +2139,7 @@ " 1\n", " 0\n", " 11\n", - " 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff\n", + " 6d882651-2506-41cb-8704-85575c64b143\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -2161,7 +2163,7 @@ " 1\n", " 0\n", " 11\n", - " 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44\n", + " f8ccec16-576c-4e3e-8bec-359dff01d6d2\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -2199,11 +2201,11 @@ "4 1 0 
11 \n", "\n", " document_id document_hash ext \\\n", - "0 3618834f-9dfc-49a1-9066-e2724df95fec 6571294142213095721 pdf \n", - "1 27880888-8e1a-4b46-a6a9-fecba8eee0eb 10026122586747302274 pdf \n", - "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", - "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", - "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", + "0 8dc8970e-215a-44fe-a7bf-946c03f36c60 6571294142213095721 pdf \n", + "1 9ac78463-b325-406b-891e-c9e84722eb34 10026122586747302274 pdf \n", + "2 b3ed1942-54a6-49fc-bcbc-2d8c438adef3 10729312978404042321 pdf \n", + "3 6d882651-2506-41cb-8704-85575c64b143 7758129997476962679 pdf \n", + "4 f8ccec16-576c-4e3e-8bec-359dff01d6d2 14711865278795535908 pdf \n", "\n", " hash size ... \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 ... \n", @@ -2326,7 +2328,7 @@ " 1\n", " 0\n", " 11\n", - " 84f59118-2a64-4d4b-991c-10ca09576a74\n", + " b3ed1942-54a6-49fc-bcbc-2d8c438adef3\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", @@ -2350,7 +2352,7 @@ " 1\n", " 0\n", " 11\n", - " 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff\n", + " 6d882651-2506-41cb-8704-85575c64b143\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -2374,7 +2376,7 @@ " 1\n", " 0\n", " 11\n", - " 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44\n", + " f8ccec16-576c-4e3e-8bec-359dff01d6d2\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -2408,9 +2410,9 @@ "4 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", - "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", - "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", + "2 b3ed1942-54a6-49fc-bcbc-2d8c438adef3 10729312978404042321 pdf \n", + "3 6d882651-2506-41cb-8704-85575c64b143 7758129997476962679 pdf \n", + "4 
f8ccec16-576c-4e3e-8bec-359dff01d6d2 14711865278795535908 pdf \n", "\n", " hash size ... \\\n", "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 ... \n",