diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml
index 62ae5677d..5c6cd4680 100644
--- a/transforms/pyproject.toml
+++ b/transforms/pyproject.toml
@@ -87,7 +87,8 @@ language = { file = [
"universal/tokenization/requirements.txt",
"universal/web2parquet/requirements.txt",
"universal/profiler/requirements.txt",
-"universal/resize/requirements.txt"
+"universal/resize/requirements.txt",
+"universal/rep_removal/requirements.txt"
]}
# pyproject.toml must be in a parent and cannot be in sibling
diff --git a/transforms/transforms-dev1-testing.ipynb b/transforms/transforms-dev1-testing.ipynb
new file mode 100644
index 000000000..2366dcf5b
--- /dev/null
+++ b/transforms/transforms-dev1-testing.ipynb
@@ -0,0 +1,886 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "e0dfb3c5-7419-48b3-ae05-706ec1829b6e",
+ "metadata": {},
+ "source": [
+ "Assumes that the transforms package has been installed in the venv and that all setup steps required for cargo and rep_removal were done in the venv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "8d049f72-9ab5-486b-99d0-70e374c9f656",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/touma/data-prep-kit-pkg/transforms/venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "from huggingface_hub import hf_hub_download\n",
+ "import pyarrow.parquet as pq\n",
+ "import pandas as pd\n",
+ "import os"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "ad36252c-8730-46fe-8882-a6be7c5076c5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 3.57 s, sys: 4.86 s, total: 8.42 s\n",
+ "Wall time: 51.9 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "REPO_ID = \"HuggingFaceFW/fineweb\"\n",
+ "FILENAME = \"data/CC-MAIN-2013-20/000_00000.parquet\"\n",
+ "file1=hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type=\"dataset\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "90ba29c1-6c70-4fba-b700-8dd2630d8b4e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#os.path.dirname(file1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "4204bf13-5af6-4235-9a93-140e181cd3a5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 4.71 s, sys: 7.07 s, total: 11.8 s\n",
+ "Wall time: 8.42 s\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " text | \n",
+ " id | \n",
+ " dump | \n",
+ " url | \n",
+ " date | \n",
+ " file_path | \n",
+ " language | \n",
+ " language_score | \n",
+ " token_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " How AP reported in all formats from tornado-st... | \n",
+ " <urn:uuid:d66bc6fe-8477-4adf-b430-f6a558ccc8ff> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://%20jwashington@ap.org/Content/Press-Rel... | \n",
+ " 2013-05-18T05:48:54Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.972142 | \n",
+ " 717 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Did you know you have two little yellow, nine-... | \n",
+ " <urn:uuid:803e14c3-dc2e-43d6-b75d-6fb3981c4fe6> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://1000awesomethings.com/2012/09/24/934-ad... | \n",
+ " 2013-05-18T08:11:45Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.947991 | \n",
+ " 821 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Car Wash For Clara!\\nNow is your chance to hel... | \n",
+ " <urn:uuid:ac1bbfff-9519-4967-9c64-3dc3a4b471ec> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://1027kord.com/car-wash-for-clara/ | \n",
+ " 2013-05-18T06:49:55Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.911518 | \n",
+ " 125 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Listeners Get Sky-high View of Missoula From H... | \n",
+ " <urn:uuid:c1445c58-b111-4c4e-badd-1e43ec317df7> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://1075zoofm.com/listeners-get-sky-high-vi... | \n",
+ " 2013-05-18T06:25:20Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.956516 | \n",
+ " 103 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Log In Please enter your ECode to log in.\\nFor... | \n",
+ " <urn:uuid:e5829f7d-b944-4468-9573-61b7cb3078cc> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://1105govinfoevents.com/enterprisearchite... | \n",
+ " 2013-05-18T05:27:01Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.798235 | \n",
+ " 75 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1091391 | \n",
+ " PALMS — The winner of a $7 million SuperLotto ... | \n",
+ " <urn:uuid:9a5989f7-b385-498f-84de-75abc9272805> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://www.scpr.org/news/2010/06/06/15880/7m-s... | \n",
+ " 2013-05-22T08:33:55Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.971524 | \n",
+ " 165 | \n",
+ "
\n",
+ " \n",
+ " 1091392 | \n",
+ " Irfan Khan/AFP/Getty Images\\nFormer Bell City ... | \n",
+ " <urn:uuid:b49419dd-bc94-4302-a097-6c544fa0631e> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://www.scpr.org/news/2011/03/15/24996/atto... | \n",
+ " 2013-05-22T07:56:02Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.973813 | \n",
+ " 313 | \n",
+ "
\n",
+ " \n",
+ " 1091393 | \n",
+ " A more common sentiment than you would think (... | \n",
+ " <urn:uuid:832b678a-df73-4131-b479-b9fbd3370a6f> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://www.scq.ubc.ca/sciencescouts/the-i%E2%8... | \n",
+ " 2013-05-22T07:55:36Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.969990 | \n",
+ " 217 | \n",
+ "
\n",
+ " \n",
+ " 1091394 | \n",
+ " Paper Fashions Boutique is here to save you ti... | \n",
+ " <urn:uuid:1c61271c-9694-4481-aef2-117fea466605> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://www.scrapscene.com/2010/08/new-scrapboo... | \n",
+ " 2013-05-22T08:27:53Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.963822 | \n",
+ " 659 | \n",
+ "
\n",
+ " \n",
+ " 1091395 | \n",
+ " Admissions down in Argentina by 7% in first ha... | \n",
+ " <urn:uuid:8759fd30-1bf9-4538-83d1-1195e0d08f93> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://www.screendaily.com/admissions-down-in-... | \n",
+ " 2013-05-22T08:13:50Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.925611 | \n",
+ " 252 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1091396 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " text \\\n",
+ "0 How AP reported in all formats from tornado-st... \n",
+ "1 Did you know you have two little yellow, nine-... \n",
+ "2 Car Wash For Clara!\\nNow is your chance to hel... \n",
+ "3 Listeners Get Sky-high View of Missoula From H... \n",
+ "4 Log In Please enter your ECode to log in.\\nFor... \n",
+ "... ... \n",
+ "1091391 PALMS — The winner of a $7 million SuperLotto ... \n",
+ "1091392 Irfan Khan/AFP/Getty Images\\nFormer Bell City ... \n",
+ "1091393 A more common sentiment than you would think (... \n",
+ "1091394 Paper Fashions Boutique is here to save you ti... \n",
+ "1091395 Admissions down in Argentina by 7% in first ha... \n",
+ "\n",
+ " id dump \\\n",
+ "0 CC-MAIN-2013-20 \n",
+ "1 CC-MAIN-2013-20 \n",
+ "2 CC-MAIN-2013-20 \n",
+ "3 CC-MAIN-2013-20 \n",
+ "4 CC-MAIN-2013-20 \n",
+ "... ... ... \n",
+ "1091391 CC-MAIN-2013-20 \n",
+ "1091392 CC-MAIN-2013-20 \n",
+ "1091393 CC-MAIN-2013-20 \n",
+ "1091394 CC-MAIN-2013-20 \n",
+ "1091395 CC-MAIN-2013-20 \n",
+ "\n",
+ " url \\\n",
+ "0 http://%20jwashington@ap.org/Content/Press-Rel... \n",
+ "1 http://1000awesomethings.com/2012/09/24/934-ad... \n",
+ "2 http://1027kord.com/car-wash-for-clara/ \n",
+ "3 http://1075zoofm.com/listeners-get-sky-high-vi... \n",
+ "4 http://1105govinfoevents.com/enterprisearchite... \n",
+ "... ... \n",
+ "1091391 http://www.scpr.org/news/2010/06/06/15880/7m-s... \n",
+ "1091392 http://www.scpr.org/news/2011/03/15/24996/atto... \n",
+ "1091393 http://www.scq.ubc.ca/sciencescouts/the-i%E2%8... \n",
+ "1091394 http://www.scrapscene.com/2010/08/new-scrapboo... \n",
+ "1091395 http://www.screendaily.com/admissions-down-in-... \n",
+ "\n",
+ " date \\\n",
+ "0 2013-05-18T05:48:54Z \n",
+ "1 2013-05-18T08:11:45Z \n",
+ "2 2013-05-18T06:49:55Z \n",
+ "3 2013-05-18T06:25:20Z \n",
+ "4 2013-05-18T05:27:01Z \n",
+ "... ... \n",
+ "1091391 2013-05-22T08:33:55Z \n",
+ "1091392 2013-05-22T07:56:02Z \n",
+ "1091393 2013-05-22T07:55:36Z \n",
+ "1091394 2013-05-22T08:27:53Z \n",
+ "1091395 2013-05-22T08:13:50Z \n",
+ "\n",
+ " file_path language \\\n",
+ "0 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "1 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "2 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "3 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "4 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "... ... ... \n",
+ "1091391 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "1091392 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "1091393 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "1091394 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "1091395 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "\n",
+ " language_score token_count \n",
+ "0 0.972142 717 \n",
+ "1 0.947991 821 \n",
+ "2 0.911518 125 \n",
+ "3 0.956516 103 \n",
+ "4 0.798235 75 \n",
+ "... ... ... \n",
+ "1091391 0.971524 165 \n",
+ "1091392 0.973813 313 \n",
+ "1091393 0.969990 217 \n",
+ "1091394 0.963822 659 \n",
+ "1091395 0.925611 252 \n",
+ "\n",
+ "[1091396 rows x 9 columns]"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "import pyarrow.parquet as pq\n",
+ "import pandas as pd\n",
+ "table = pq.read_table(file1)\n",
+ "table.to_pandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "b6bbf09e-240d-4017-9bd3-80c809b01d27",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "12:43:08 INFO - Doc id parameters are : {'doc_column': 'text', 'hash_column': 'document_id', 'int_column': 'int_id_column', 'start_id': 5}\n",
+ "12:43:08 INFO - pipeline id pipeline_id\n",
+ "12:43:08 INFO - code location None\n",
+ "12:43:08 INFO - data factory data_ is using local data access: input_folder - /Users/touma/.cache/huggingface/hub/datasets--HuggingFaceFW--fineweb/snapshots/0f039043b23fe1d4eed300b504aa4b4a68f1c7ba/data/CC-MAIN-2013-20 output_folder - files-doc-id\n",
+ "12:43:08 INFO - data factory data_ max_files -1, n_sample -1\n",
+ "12:43:08 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
+ "12:43:08 INFO - orchestrator doc_id started at 2025-02-03 12:43:08\n",
+ "12:43:08 INFO - Number of files is 1, source profile {'max_file_size': 2048.0454998016357, 'min_file_size': 2048.0454998016357, 'total_file_size': 2048.0454998016357}\n",
+ "12:43:30 INFO - Completed 1 files (100.0%) in 0.374 min\n",
+ "12:43:30 INFO - Done processing 1 files, waiting for flush() completion.\n",
+ "12:43:30 INFO - done flushing in 0.0 sec\n",
+ "12:43:30 INFO - Completed execution in 0.374 min, execution result 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 17.8 s, sys: 6.27 s, total: 24.1 s\n",
+ "Wall time: 22.5 s\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "from dpk_doc_id.transform_python import DocID\n",
+ "DocID(input_folder= os.path.dirname(file1),\n",
+ " output_folder= \"files-doc-id\",\n",
+ " doc_id_doc_column= \"text\",\n",
+ " doc_id_hash_column= \"document_id\",\n",
+ " doc_id_int_column= \"int_id_column\",\n",
+ " doc_id_start_id= 5).transform()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "2b841fe0-696a-47b9-a93d-683190410710",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#%%time\n",
+ "#import pyarrow.parquet as pq\n",
+ "#import pandas as pd\n",
+ "#table = pq.read_table('files-doc-id/000_00000.parquet')\n",
+ "#table.to_pandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "72d7a18b-a218-4cd2-9877-61cfb32fff1a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "12:45:13 INFO - pipeline id pipeline_id\n",
+ "INFO:data_processing.runtime.execution_configuration:pipeline id pipeline_id\n",
+ "12:45:13 INFO - code location None\n",
+ "INFO:data_processing.runtime.execution_configuration:code location None\n",
+ "12:45:13 INFO - data factory data_ is using local data access: input_folder - /Users/touma/.cache/huggingface/hub/datasets--HuggingFaceFW--fineweb/snapshots/0f039043b23fe1d4eed300b504aa4b4a68f1c7ba/data/CC-MAIN-2013-20 output_folder - files-rep_removal\n",
+ "INFO:data_processing.data_access.data_access_factory_base9afa7eae-b98d-4b5e-b07d-cd279ce6afde:data factory data_ is using local data access: input_folder - /Users/touma/.cache/huggingface/hub/datasets--HuggingFaceFW--fineweb/snapshots/0f039043b23fe1d4eed300b504aa4b4a68f1c7ba/data/CC-MAIN-2013-20 output_folder - files-rep_removal\n",
+ "12:45:13 INFO - data factory data_ max_files -1, n_sample -1\n",
+ "INFO:data_processing.data_access.data_access_factory_base9afa7eae-b98d-4b5e-b07d-cd279ce6afde:data factory data_ max_files -1, n_sample -1\n",
+ "12:45:13 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
+ "INFO:data_processing.data_access.data_access_factory_base9afa7eae-b98d-4b5e-b07d-cd279ce6afde:data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
+ "12:45:13 INFO - orchestrator rep_removal started at 2025-02-03 12:45:13\n",
+ "INFO:data_processing.runtime.pure_python.transform_orchestrator:orchestrator rep_removal started at 2025-02-03 12:45:13\n",
+ "12:45:13 INFO - Number of files is 1, source profile {'max_file_size': 2048.0454998016357, 'min_file_size': 2048.0454998016357, 'total_file_size': 2048.0454998016357}\n",
+ "INFO:data_processing.runtime.pure_python.transform_orchestrator:Number of files is 1, source profile {'max_file_size': 2048.0454998016357, 'min_file_size': 2048.0454998016357, 'total_file_size': 2048.0454998016357}\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "cpu speed: 3504 MHz, Cores: 12\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:root:timeout is: 35130.8109303653\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "gpu_usage: 0.00%, GPU speed: 0 MHz\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:root:running the merge\n",
+ "INFO:root:merging complete\n",
+ "\u001b[1m\u001b[33mwarning\u001b[0m\u001b[1m:\u001b[0m no edition set: defaulting to the 2015 edition while the latest is 2021\n",
+ "\u001b[1m\u001b[32m Updating\u001b[0m crates.io index\n",
+ "\u001b[1m\u001b[32m Locking\u001b[0m 48 packages to latest compatible versions\n",
+ "\u001b[1m\u001b[36m Adding\u001b[0m clap v3.2.25 \u001b[1m\u001b[33m(available: v4.5.27)\u001b[0m\n",
+ "\u001b[1m\u001b[36m Adding\u001b[0m crossbeam v0.3.2 \u001b[1m\u001b[33m(available: v0.8.4)\u001b[0m\n",
+ "\u001b[1m\u001b[36m Adding\u001b[0m filebuffer v0.4.0 \u001b[1m\u001b[33m(available: v1.0.0)\u001b[0m\n",
+ "\u001b[1m\u001b[36m Adding\u001b[0m zstd v0.5.4+zstd.1.4.7 \u001b[1m\u001b[33m(available: v0.13.2)\u001b[0m\n",
+ "\u001b[1m\u001b[36m Adding\u001b[0m zstd-sys v1.4.18+zstd.1.4.7 \u001b[1m\u001b[33m(available: v1.6.3+zstd.1.5.2)\u001b[0m\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m libc v0.2.169\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m proc-macro2 v1.0.93\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m version_check v0.9.5\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m shlex v1.3.0\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m either v1.13.0\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m unicode-ident v1.0.16\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m glob v0.3.2\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m syn v1.0.109\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m autocfg v1.4.0\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m zstd-safe v2.0.6+zstd.1.4.7\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m heck v0.4.1\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m hashbrown v0.12.3\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Start load!\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "13:49:30 INFO - Completed 1 files (100.0%) in 64.286 min\n",
+ "INFO:data_processing.runtime.pure_python.transform_orchestrator:Completed 1 files (100.0%) in 64.286 min\n",
+ "13:49:30 INFO - Done processing 1 files, waiting for flush() completion.\n",
+ "INFO:data_processing.runtime.pure_python.transform_orchestrator:Done processing 1 files, waiting for flush() completion.\n",
+ "13:49:30 INFO - done flushing in 0.001 sec\n",
+ "INFO:data_processing.runtime.pure_python.transform_orchestrator:done flushing in 0.001 sec\n",
+ "13:49:30 INFO - Completed execution in 64.287 min, execution result 0\n",
+ "INFO:data_processing.runtime.pure_python.transform_launcher:Completed execution in 64.287 min, execution result 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 43.6 s, sys: 1min 22s, total: 2min 5s\n",
+ "Wall time: 1h 4min 25s\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0 / 1474657457 \n",
+ "1000000000 / 1474657457 \n",
+ "Duplicates found: 21535301\n",
+ "Total time taken: 119206ms\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1m\u001b[32m Compiling\u001b[0m itertools v0.9.0\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m os_str_bytes v6.6.1\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m textwrap v0.16.1\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m strsim v0.10.0\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m proc-macro-error-attr v1.0.4\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m proc-macro-error v1.0.4\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m indexmap v1.9.3\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m clap_lex v0.2.4\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m termcolor v1.4.1\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m once_cell v1.20.2\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m bitflags v1.3.2\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m crossbeam v0.3.2\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m quote v1.0.38\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m jobserver v0.1.32\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m atty v0.2.14\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m filebuffer v0.4.0\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m cc v1.2.11\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m zstd-sys v1.4.18+zstd.1.4.7\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m clap_derive v3.2.25\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m clap v3.2.25\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m zstd v0.5.4+zstd.1.4.7\n",
+ "\u001b[1m\u001b[32m Compiling\u001b[0m dedup_dataset v1.0.0 (/Users/touma/data-prep-kit-pkg/transforms/venv/lib/python3.11/site-packages/dpk_rep_removal/rust)\n",
+ "\u001b[1m\u001b[32m Finished\u001b[0m `dev` profile [optimized + debuginfo] target(s) in 12.37s\n",
+ "\u001b[1m\u001b[32m Running\u001b[0m `venv/lib/python3.11/site-packages/dpk_rep_removal/rust/target/debug/dedup_dataset self-similar --data-file /var/folders/lb/tysjhggx38l6g9xxg5whzxfc0000gn/T/tmp_h3cw1xg/save_dir/parquet --length-threshold 50 --cache-dir /var/folders/lb/tysjhggx38l6g9xxg5whzxfc0000gn/T/tmp_h3cw1xg/cache --num-threads 1 --frequency-threshold 1 --retain-first-copy`\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "from dpk_rep_removal.runtime import RepRemoval\n",
+ "RepRemoval(input_folder= os.path.dirname(file1),\n",
+ " output_folder= \"files-rep_removal\",\n",
+ " rep_removal_contents_column_name='text', \n",
+ " rep_removal_num_threads='1',\n",
+ " ).transform()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "296200e3-503e-4e5f-92f9-4dd78484c615",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 5.9 s, sys: 4.47 s, total: 10.4 s\n",
+ "Wall time: 11.8 s\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " text | \n",
+ " id | \n",
+ " dump | \n",
+ " url | \n",
+ " date | \n",
+ " file_path | \n",
+ " language | \n",
+ " language_score | \n",
+ " token_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " How AP reported in all formats from tornado-st... | \n",
+ " <urn:uuid:d66bc6fe-8477-4adf-b430-f6a558ccc8ff> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://%20jwashington@ap.org/Content/Press-Rel... | \n",
+ " 2013-05-18T05:48:54Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.972142 | \n",
+ " 717 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Did you know you have two little yellow, nine-... | \n",
+ " <urn:uuid:803e14c3-dc2e-43d6-b75d-6fb3981c4fe6> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://1000awesomethings.com/2012/09/24/934-ad... | \n",
+ " 2013-05-18T08:11:45Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.947991 | \n",
+ " 821 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Car Wash For Clara!\\nNow is your chance to hel... | \n",
+ " <urn:uuid:ac1bbfff-9519-4967-9c64-3dc3a4b471ec> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://1027kord.com/car-wash-for-clara/ | \n",
+ " 2013-05-18T06:49:55Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.911518 | \n",
+ " 125 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Listeners Get Sky-high View of Missoula From H... | \n",
+ " <urn:uuid:c1445c58-b111-4c4e-badd-1e43ec317df7> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://1075zoofm.com/listeners-get-sky-high-vi... | \n",
+ " 2013-05-18T06:25:20Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.956516 | \n",
+ " 103 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Log In Please enter your ECode to log in.\\nFor... | \n",
+ " <urn:uuid:e5829f7d-b944-4468-9573-61b7cb3078cc> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://1105govinfoevents.com/enterprisearchite... | \n",
+ " 2013-05-18T05:27:01Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.798235 | \n",
+ " 75 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1091391 | \n",
+ " PALMS — The winner of a $7 million SuperLotto ... | \n",
+ " <urn:uuid:9a5989f7-b385-498f-84de-75abc9272805> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://www.scpr.org/news/2010/06/06/15880/7m-s... | \n",
+ " 2013-05-22T08:33:55Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.971524 | \n",
+ " 165 | \n",
+ "
\n",
+ " \n",
+ " 1091392 | \n",
+ " Irfan Khan/AFP/Getty Images\\nFormer Bell City ... | \n",
+ " <urn:uuid:b49419dd-bc94-4302-a097-6c544fa0631e> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://www.scpr.org/news/2011/03/15/24996/atto... | \n",
+ " 2013-05-22T07:56:02Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.973813 | \n",
+ " 313 | \n",
+ "
\n",
+ " \n",
+ " 1091393 | \n",
+ " A more common sentiment than you would think (... | \n",
+ " <urn:uuid:832b678a-df73-4131-b479-b9fbd3370a6f> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://www.scq.ubc.ca/sciencescouts/the-i%E2%8... | \n",
+ " 2013-05-22T07:55:36Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.969990 | \n",
+ " 217 | \n",
+ "
\n",
+ " \n",
+ " 1091394 | \n",
+ " Paper Fashions Boutique is here to save you ti... | \n",
+ " <urn:uuid:1c61271c-9694-4481-aef2-117fea466605> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://www.scrapscene.com/2010/08/new-scrapboo... | \n",
+ " 2013-05-22T08:27:53Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.963822 | \n",
+ " 659 | \n",
+ "
\n",
+ " \n",
+ " 1091395 | \n",
+ " Admissions down in Argentina by 7% in first ha... | \n",
+ " <urn:uuid:8759fd30-1bf9-4538-83d1-1195e0d08f93> | \n",
+ " CC-MAIN-2013-20 | \n",
+ " http://www.screendaily.com/admissions-down-in-... | \n",
+ " 2013-05-22T08:13:50Z | \n",
+ " s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n",
+ " en | \n",
+ " 0.925611 | \n",
+ " 252 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1091396 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " text \\\n",
+ "0 How AP reported in all formats from tornado-st... \n",
+ "1 Did you know you have two little yellow, nine-... \n",
+ "2 Car Wash For Clara!\\nNow is your chance to hel... \n",
+ "3 Listeners Get Sky-high View of Missoula From H... \n",
+ "4 Log In Please enter your ECode to log in.\\nFor... \n",
+ "... ... \n",
+ "1091391 PALMS — The winner of a $7 million SuperLotto ... \n",
+ "1091392 Irfan Khan/AFP/Getty Images\\nFormer Bell City ... \n",
+ "1091393 A more common sentiment than you would think (... \n",
+ "1091394 Paper Fashions Boutique is here to save you ti... \n",
+ "1091395 Admissions down in Argentina by 7% in first ha... \n",
+ "\n",
+ " id dump \\\n",
+ "0 CC-MAIN-2013-20 \n",
+ "1 CC-MAIN-2013-20 \n",
+ "2 CC-MAIN-2013-20 \n",
+ "3 CC-MAIN-2013-20 \n",
+ "4 CC-MAIN-2013-20 \n",
+ "... ... ... \n",
+ "1091391 CC-MAIN-2013-20 \n",
+ "1091392 CC-MAIN-2013-20 \n",
+ "1091393 CC-MAIN-2013-20 \n",
+ "1091394 CC-MAIN-2013-20 \n",
+ "1091395 CC-MAIN-2013-20 \n",
+ "\n",
+ " url \\\n",
+ "0 http://%20jwashington@ap.org/Content/Press-Rel... \n",
+ "1 http://1000awesomethings.com/2012/09/24/934-ad... \n",
+ "2 http://1027kord.com/car-wash-for-clara/ \n",
+ "3 http://1075zoofm.com/listeners-get-sky-high-vi... \n",
+ "4 http://1105govinfoevents.com/enterprisearchite... \n",
+ "... ... \n",
+ "1091391 http://www.scpr.org/news/2010/06/06/15880/7m-s... \n",
+ "1091392 http://www.scpr.org/news/2011/03/15/24996/atto... \n",
+ "1091393 http://www.scq.ubc.ca/sciencescouts/the-i%E2%8... \n",
+ "1091394 http://www.scrapscene.com/2010/08/new-scrapboo... \n",
+ "1091395 http://www.screendaily.com/admissions-down-in-... \n",
+ "\n",
+ " date \\\n",
+ "0 2013-05-18T05:48:54Z \n",
+ "1 2013-05-18T08:11:45Z \n",
+ "2 2013-05-18T06:49:55Z \n",
+ "3 2013-05-18T06:25:20Z \n",
+ "4 2013-05-18T05:27:01Z \n",
+ "... ... \n",
+ "1091391 2013-05-22T08:33:55Z \n",
+ "1091392 2013-05-22T07:56:02Z \n",
+ "1091393 2013-05-22T07:55:36Z \n",
+ "1091394 2013-05-22T08:27:53Z \n",
+ "1091395 2013-05-22T08:13:50Z \n",
+ "\n",
+ " file_path language \\\n",
+ "0 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "1 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "2 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "3 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "4 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "... ... ... \n",
+ "1091391 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "1091392 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "1091393 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "1091394 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "1091395 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n",
+ "\n",
+ " language_score token_count \n",
+ "0 0.972142 717 \n",
+ "1 0.947991 821 \n",
+ "2 0.911518 125 \n",
+ "3 0.956516 103 \n",
+ "4 0.798235 75 \n",
+ "... ... ... \n",
+ "1091391 0.971524 165 \n",
+ "1091392 0.973813 313 \n",
+ "1091393 0.969990 217 \n",
+ "1091394 0.963822 659 \n",
+ "1091395 0.925611 252 \n",
+ "\n",
+ "[1091396 rows x 9 columns]"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "import pyarrow.parquet as pq\n",
+ "import pandas as pd\n",
+ "table = pq.read_table('files-rep_removal/000_00000.parquet')\n",
+ "table.to_pandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e80e2e5a-4318-47bd-a7f0-a446f532e60e",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}