diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 62ae5677d..5c6cd4680 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -87,7 +87,8 @@ language = { file = [ "universal/tokenization/requirements.txt", "universal/web2parquet/requirements.txt", "universal/profiler/requirements.txt", -"universal/resize/requirements.txt" +"universal/resize/requirements.txt", +"universal/rep_removal/requirements.txt" ]} # pyproject.toml must be in a parent and cannot be in sibling diff --git a/transforms/transforms-dev1-testing.ipynb b/transforms/transforms-dev1-testing.ipynb new file mode 100644 index 000000000..2366dcf5b --- /dev/null +++ b/transforms/transforms-dev1-testing.ipynb @@ -0,0 +1,886 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e0dfb3c5-7419-48b3-ae05-706ec1829b6e", + "metadata": {}, + "source": [ + "Assumes that the transforms package has been installed in the venv and all manipulations required for cargo and rep_removal were done in the venv" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "8d049f72-9ab5-486b-99d0-70e374c9f656", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/touma/data-prep-kit-pkg/transforms/venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from huggingface_hub import hf_hub_download\n", + "import pyarrow.parquet as pq\n", + "import pandas as pd\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ad36252c-8730-46fe-8882-a6be7c5076c5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 3.57 s, sys: 4.86 s, total: 8.42 s\n", + "Wall time: 51.9 s\n" + ] + } + ], + "source": [ + "%%time\n", + "REPO_ID = \"HuggingFaceFW/fineweb\"\n", + "FILENAME = \"data/CC-MAIN-2013-20/000_00000.parquet\"\n", + "file1=hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type=\"dataset\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "90ba29c1-6c70-4fba-b700-8dd2630d8b4e", + "metadata": {}, + "outputs": [], + "source": [ + "#os.path.dirname(file1)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "4204bf13-5af6-4235-9a93-140e181cd3a5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 4.71 s, sys: 7.07 s, total: 11.8 s\n", + "Wall time: 8.42 s\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textiddumpurldatefile_pathlanguagelanguage_scoretoken_count
0How AP reported in all formats from tornado-st...<urn:uuid:d66bc6fe-8477-4adf-b430-f6a558ccc8ff>CC-MAIN-2013-20http://%20jwashington@ap.org/Content/Press-Rel...2013-05-18T05:48:54Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.972142717
1Did you know you have two little yellow, nine-...<urn:uuid:803e14c3-dc2e-43d6-b75d-6fb3981c4fe6>CC-MAIN-2013-20http://1000awesomethings.com/2012/09/24/934-ad...2013-05-18T08:11:45Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.947991821
2Car Wash For Clara!\\nNow is your chance to hel...<urn:uuid:ac1bbfff-9519-4967-9c64-3dc3a4b471ec>CC-MAIN-2013-20http://1027kord.com/car-wash-for-clara/2013-05-18T06:49:55Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.911518125
3Listeners Get Sky-high View of Missoula From H...<urn:uuid:c1445c58-b111-4c4e-badd-1e43ec317df7>CC-MAIN-2013-20http://1075zoofm.com/listeners-get-sky-high-vi...2013-05-18T06:25:20Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.956516103
4Log In Please enter your ECode to log in.\\nFor...<urn:uuid:e5829f7d-b944-4468-9573-61b7cb3078cc>CC-MAIN-2013-20http://1105govinfoevents.com/enterprisearchite...2013-05-18T05:27:01Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.79823575
..............................
1091391PALMS — The winner of a $7 million SuperLotto ...<urn:uuid:9a5989f7-b385-498f-84de-75abc9272805>CC-MAIN-2013-20http://www.scpr.org/news/2010/06/06/15880/7m-s...2013-05-22T08:33:55Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.971524165
1091392Irfan Khan/AFP/Getty Images\\nFormer Bell City ...<urn:uuid:b49419dd-bc94-4302-a097-6c544fa0631e>CC-MAIN-2013-20http://www.scpr.org/news/2011/03/15/24996/atto...2013-05-22T07:56:02Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.973813313
1091393A more common sentiment than you would think (...<urn:uuid:832b678a-df73-4131-b479-b9fbd3370a6f>CC-MAIN-2013-20http://www.scq.ubc.ca/sciencescouts/the-i%E2%8...2013-05-22T07:55:36Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.969990217
1091394Paper Fashions Boutique is here to save you ti...<urn:uuid:1c61271c-9694-4481-aef2-117fea466605>CC-MAIN-2013-20http://www.scrapscene.com/2010/08/new-scrapboo...2013-05-22T08:27:53Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.963822659
1091395Admissions down in Argentina by 7% in first ha...<urn:uuid:8759fd30-1bf9-4538-83d1-1195e0d08f93>CC-MAIN-2013-20http://www.screendaily.com/admissions-down-in-...2013-05-22T08:13:50Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.925611252
\n", + "

1091396 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 How AP reported in all formats from tornado-st... \n", + "1 Did you know you have two little yellow, nine-... \n", + "2 Car Wash For Clara!\\nNow is your chance to hel... \n", + "3 Listeners Get Sky-high View of Missoula From H... \n", + "4 Log In Please enter your ECode to log in.\\nFor... \n", + "... ... \n", + "1091391 PALMS — The winner of a $7 million SuperLotto ... \n", + "1091392 Irfan Khan/AFP/Getty Images\\nFormer Bell City ... \n", + "1091393 A more common sentiment than you would think (... \n", + "1091394 Paper Fashions Boutique is here to save you ti... \n", + "1091395 Admissions down in Argentina by 7% in first ha... \n", + "\n", + " id dump \\\n", + "0 CC-MAIN-2013-20 \n", + "1 CC-MAIN-2013-20 \n", + "2 CC-MAIN-2013-20 \n", + "3 CC-MAIN-2013-20 \n", + "4 CC-MAIN-2013-20 \n", + "... ... ... \n", + "1091391 CC-MAIN-2013-20 \n", + "1091392 CC-MAIN-2013-20 \n", + "1091393 CC-MAIN-2013-20 \n", + "1091394 CC-MAIN-2013-20 \n", + "1091395 CC-MAIN-2013-20 \n", + "\n", + " url \\\n", + "0 http://%20jwashington@ap.org/Content/Press-Rel... \n", + "1 http://1000awesomethings.com/2012/09/24/934-ad... \n", + "2 http://1027kord.com/car-wash-for-clara/ \n", + "3 http://1075zoofm.com/listeners-get-sky-high-vi... \n", + "4 http://1105govinfoevents.com/enterprisearchite... \n", + "... ... \n", + "1091391 http://www.scpr.org/news/2010/06/06/15880/7m-s... \n", + "1091392 http://www.scpr.org/news/2011/03/15/24996/atto... \n", + "1091393 http://www.scq.ubc.ca/sciencescouts/the-i%E2%8... \n", + "1091394 http://www.scrapscene.com/2010/08/new-scrapboo... \n", + "1091395 http://www.screendaily.com/admissions-down-in-... \n", + "\n", + " date \\\n", + "0 2013-05-18T05:48:54Z \n", + "1 2013-05-18T08:11:45Z \n", + "2 2013-05-18T06:49:55Z \n", + "3 2013-05-18T06:25:20Z \n", + "4 2013-05-18T05:27:01Z \n", + "... ... 
\n", + "1091391 2013-05-22T08:33:55Z \n", + "1091392 2013-05-22T07:56:02Z \n", + "1091393 2013-05-22T07:55:36Z \n", + "1091394 2013-05-22T08:27:53Z \n", + "1091395 2013-05-22T08:13:50Z \n", + "\n", + " file_path language \\\n", + "0 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "1 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "2 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "3 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "4 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "... ... ... \n", + "1091391 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "1091392 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "1091393 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "1091394 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "1091395 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "\n", + " language_score token_count \n", + "0 0.972142 717 \n", + "1 0.947991 821 \n", + "2 0.911518 125 \n", + "3 0.956516 103 \n", + "4 0.798235 75 \n", + "... ... ... 
\n", + "1091391 0.971524 165 \n", + "1091392 0.973813 313 \n", + "1091393 0.969990 217 \n", + "1091394 0.963822 659 \n", + "1091395 0.925611 252 \n", + "\n", + "[1091396 rows x 9 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "import pyarrow.parquet as pq\n", + "import pandas as pd\n", + "table = pq.read_table(file1)\n", + "table.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b6bbf09e-240d-4017-9bd3-80c809b01d27", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "12:43:08 INFO - Doc id parameters are : {'doc_column': 'text', 'hash_column': 'document_id', 'int_column': 'int_id_column', 'start_id': 5}\n", + "12:43:08 INFO - pipeline id pipeline_id\n", + "12:43:08 INFO - code location None\n", + "12:43:08 INFO - data factory data_ is using local data access: input_folder - /Users/touma/.cache/huggingface/hub/datasets--HuggingFaceFW--fineweb/snapshots/0f039043b23fe1d4eed300b504aa4b4a68f1c7ba/data/CC-MAIN-2013-20 output_folder - files-doc-id\n", + "12:43:08 INFO - data factory data_ max_files -1, n_sample -1\n", + "12:43:08 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "12:43:08 INFO - orchestrator doc_id started at 2025-02-03 12:43:08\n", + "12:43:08 INFO - Number of files is 1, source profile {'max_file_size': 2048.0454998016357, 'min_file_size': 2048.0454998016357, 'total_file_size': 2048.0454998016357}\n", + "12:43:30 INFO - Completed 1 files (100.0%) in 0.374 min\n", + "12:43:30 INFO - Done processing 1 files, waiting for flush() completion.\n", + "12:43:30 INFO - done flushing in 0.0 sec\n", + "12:43:30 INFO - Completed execution in 0.374 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 17.8 s, sys: 6.27 s, 
total: 24.1 s\n", + "Wall time: 22.5 s\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "from dpk_doc_id.transform_python import DocID\n", + "DocID(input_folder= os.path.dirname(file1),\n", + " output_folder= \"files-doc-id\",\n", + " doc_id_doc_column= \"text\",\n", + " doc_id_hash_column= \"document_id\",\n", + " doc_id_int_column= \"int_id_column\",\n", + " doc_id_start_id= 5).transform()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "2b841fe0-696a-47b9-a93d-683190410710", + "metadata": {}, + "outputs": [], + "source": [ + "#%%time\n", + "#import pyarrow.parquet as pq\n", + "#import pandas as pd\n", + "#table = pq.read_table('files-doc-id/000_00000.parquet')\n", + "#table.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "72d7a18b-a218-4cd2-9877-61cfb32fff1a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "12:45:13 INFO - pipeline id pipeline_id\n", + "INFO:data_processing.runtime.execution_configuration:pipeline id pipeline_id\n", + "12:45:13 INFO - code location None\n", + "INFO:data_processing.runtime.execution_configuration:code location None\n", + "12:45:13 INFO - data factory data_ is using local data access: input_folder - /Users/touma/.cache/huggingface/hub/datasets--HuggingFaceFW--fineweb/snapshots/0f039043b23fe1d4eed300b504aa4b4a68f1c7ba/data/CC-MAIN-2013-20 output_folder - files-rep_removal\n", + "INFO:data_processing.data_access.data_access_factory_base9afa7eae-b98d-4b5e-b07d-cd279ce6afde:data factory data_ is using local data access: input_folder - /Users/touma/.cache/huggingface/hub/datasets--HuggingFaceFW--fineweb/snapshots/0f039043b23fe1d4eed300b504aa4b4a68f1c7ba/data/CC-MAIN-2013-20 output_folder - files-rep_removal\n", + "12:45:13 INFO - data factory data_ max_files -1, n_sample -1\n", + 
"INFO:data_processing.data_access.data_access_factory_base9afa7eae-b98d-4b5e-b07d-cd279ce6afde:data factory data_ max_files -1, n_sample -1\n", + "12:45:13 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "INFO:data_processing.data_access.data_access_factory_base9afa7eae-b98d-4b5e-b07d-cd279ce6afde:data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "12:45:13 INFO - orchestrator rep_removal started at 2025-02-03 12:45:13\n", + "INFO:data_processing.runtime.pure_python.transform_orchestrator:orchestrator rep_removal started at 2025-02-03 12:45:13\n", + "12:45:13 INFO - Number of files is 1, source profile {'max_file_size': 2048.0454998016357, 'min_file_size': 2048.0454998016357, 'total_file_size': 2048.0454998016357}\n", + "INFO:data_processing.runtime.pure_python.transform_orchestrator:Number of files is 1, source profile {'max_file_size': 2048.0454998016357, 'min_file_size': 2048.0454998016357, 'total_file_size': 2048.0454998016357}\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cpu speed: 3504 MHz, Cores: 12\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:timeout is: 35130.8109303653\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "gpu_usage: 0.00%, GPU speed: 0 MHz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:running the merge\n", + "INFO:root:merging complete\n", + "\u001b[1m\u001b[33mwarning\u001b[0m\u001b[1m:\u001b[0m no edition set: defaulting to the 2015 edition while the latest is 2021\n", + "\u001b[1m\u001b[32m Updating\u001b[0m crates.io index\n", + "\u001b[1m\u001b[32m Locking\u001b[0m 48 packages to latest compatible versions\n", + "\u001b[1m\u001b[36m Adding\u001b[0m clap v3.2.25 
\u001b[1m\u001b[33m(available: v4.5.27)\u001b[0m\n", + "\u001b[1m\u001b[36m Adding\u001b[0m crossbeam v0.3.2 \u001b[1m\u001b[33m(available: v0.8.4)\u001b[0m\n", + "\u001b[1m\u001b[36m Adding\u001b[0m filebuffer v0.4.0 \u001b[1m\u001b[33m(available: v1.0.0)\u001b[0m\n", + "\u001b[1m\u001b[36m Adding\u001b[0m zstd v0.5.4+zstd.1.4.7 \u001b[1m\u001b[33m(available: v0.13.2)\u001b[0m\n", + "\u001b[1m\u001b[36m Adding\u001b[0m zstd-sys v1.4.18+zstd.1.4.7 \u001b[1m\u001b[33m(available: v1.6.3+zstd.1.5.2)\u001b[0m\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m libc v0.2.169\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m proc-macro2 v1.0.93\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m version_check v0.9.5\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m shlex v1.3.0\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m either v1.13.0\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m unicode-ident v1.0.16\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m glob v0.3.2\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m syn v1.0.109\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m autocfg v1.4.0\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m zstd-safe v2.0.6+zstd.1.4.7\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m heck v0.4.1\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m hashbrown v0.12.3\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Start load!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:49:30 INFO - Completed 1 files (100.0%) in 64.286 min\n", + "INFO:data_processing.runtime.pure_python.transform_orchestrator:Completed 1 files (100.0%) in 64.286 min\n", + "13:49:30 INFO - Done processing 1 files, waiting for flush() completion.\n", + "INFO:data_processing.runtime.pure_python.transform_orchestrator:Done processing 1 files, waiting for flush() completion.\n", + "13:49:30 INFO - done flushing in 0.001 sec\n", + "INFO:data_processing.runtime.pure_python.transform_orchestrator:done flushing in 0.001 sec\n", + "13:49:30 INFO - Completed 
execution in 64.287 min, execution result 0\n", + "INFO:data_processing.runtime.pure_python.transform_launcher:Completed execution in 64.287 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 43.6 s, sys: 1min 22s, total: 2min 5s\n", + "Wall time: 1h 4min 25s\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 / 1474657457 \n", + "1000000000 / 1474657457 \n", + "Duplicates found: 21535301\n", + "Total time taken: 119206ms\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[1m\u001b[32m Compiling\u001b[0m itertools v0.9.0\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m os_str_bytes v6.6.1\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m textwrap v0.16.1\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m strsim v0.10.0\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m proc-macro-error-attr v1.0.4\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m proc-macro-error v1.0.4\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m indexmap v1.9.3\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m clap_lex v0.2.4\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m termcolor v1.4.1\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m once_cell v1.20.2\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m bitflags v1.3.2\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m crossbeam v0.3.2\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m quote v1.0.38\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m jobserver v0.1.32\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m atty v0.2.14\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m filebuffer v0.4.0\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m cc v1.2.11\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m zstd-sys v1.4.18+zstd.1.4.7\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m clap_derive v3.2.25\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m clap 
v3.2.25\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m zstd v0.5.4+zstd.1.4.7\n", + "\u001b[1m\u001b[32m Compiling\u001b[0m dedup_dataset v1.0.0 (/Users/touma/data-prep-kit-pkg/transforms/venv/lib/python3.11/site-packages/dpk_rep_removal/rust)\n", + "\u001b[1m\u001b[32m Finished\u001b[0m `dev` profile [optimized + debuginfo] target(s) in 12.37s\n", + "\u001b[1m\u001b[32m Running\u001b[0m `venv/lib/python3.11/site-packages/dpk_rep_removal/rust/target/debug/dedup_dataset self-similar --data-file /var/folders/lb/tysjhggx38l6g9xxg5whzxfc0000gn/T/tmp_h3cw1xg/save_dir/parquet --length-threshold 50 --cache-dir /var/folders/lb/tysjhggx38l6g9xxg5whzxfc0000gn/T/tmp_h3cw1xg/cache --num-threads 1 --frequency-threshold 1 --retain-first-copy`\n" + ] + } + ], + "source": [ + "%%time\n", + "from dpk_rep_removal.runtime import RepRemoval\n", + "RepRemoval(input_folder= os.path.dirname(file1),\n", + " output_folder= \"files-rep_removal\",\n", + " rep_removal_contents_column_name='text', \n", + " rep_removal_num_threads='1',\n", + " ).transform()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "296200e3-503e-4e5f-92f9-4dd78484c615", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 5.9 s, sys: 4.47 s, total: 10.4 s\n", + "Wall time: 11.8 s\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textiddumpurldatefile_pathlanguagelanguage_scoretoken_count
0How AP reported in all formats from tornado-st...<urn:uuid:d66bc6fe-8477-4adf-b430-f6a558ccc8ff>CC-MAIN-2013-20http://%20jwashington@ap.org/Content/Press-Rel...2013-05-18T05:48:54Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.972142717
1Did you know you have two little yellow, nine-...<urn:uuid:803e14c3-dc2e-43d6-b75d-6fb3981c4fe6>CC-MAIN-2013-20http://1000awesomethings.com/2012/09/24/934-ad...2013-05-18T08:11:45Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.947991821
2Car Wash For Clara!\\nNow is your chance to hel...<urn:uuid:ac1bbfff-9519-4967-9c64-3dc3a4b471ec>CC-MAIN-2013-20http://1027kord.com/car-wash-for-clara/2013-05-18T06:49:55Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.911518125
3Listeners Get Sky-high View of Missoula From H...<urn:uuid:c1445c58-b111-4c4e-badd-1e43ec317df7>CC-MAIN-2013-20http://1075zoofm.com/listeners-get-sky-high-vi...2013-05-18T06:25:20Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.956516103
4Log In Please enter your ECode to log in.\\nFor...<urn:uuid:e5829f7d-b944-4468-9573-61b7cb3078cc>CC-MAIN-2013-20http://1105govinfoevents.com/enterprisearchite...2013-05-18T05:27:01Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.79823575
..............................
1091391PALMS — The winner of a $7 million SuperLotto ...<urn:uuid:9a5989f7-b385-498f-84de-75abc9272805>CC-MAIN-2013-20http://www.scpr.org/news/2010/06/06/15880/7m-s...2013-05-22T08:33:55Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.971524165
1091392Irfan Khan/AFP/Getty Images\\nFormer Bell City ...<urn:uuid:b49419dd-bc94-4302-a097-6c544fa0631e>CC-MAIN-2013-20http://www.scpr.org/news/2011/03/15/24996/atto...2013-05-22T07:56:02Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.973813313
1091393A more common sentiment than you would think (...<urn:uuid:832b678a-df73-4131-b479-b9fbd3370a6f>CC-MAIN-2013-20http://www.scq.ubc.ca/sciencescouts/the-i%E2%8...2013-05-22T07:55:36Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.969990217
1091394Paper Fashions Boutique is here to save you ti...<urn:uuid:1c61271c-9694-4481-aef2-117fea466605>CC-MAIN-2013-20http://www.scrapscene.com/2010/08/new-scrapboo...2013-05-22T08:27:53Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.963822659
1091395Admissions down in Argentina by 7% in first ha...<urn:uuid:8759fd30-1bf9-4538-83d1-1195e0d08f93>CC-MAIN-2013-20http://www.screendaily.com/admissions-down-in-...2013-05-22T08:13:50Zs3://commoncrawl/crawl-data/CC-MAIN-2013-20/se...en0.925611252
\n", + "

1091396 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 How AP reported in all formats from tornado-st... \n", + "1 Did you know you have two little yellow, nine-... \n", + "2 Car Wash For Clara!\\nNow is your chance to hel... \n", + "3 Listeners Get Sky-high View of Missoula From H... \n", + "4 Log In Please enter your ECode to log in.\\nFor... \n", + "... ... \n", + "1091391 PALMS — The winner of a $7 million SuperLotto ... \n", + "1091392 Irfan Khan/AFP/Getty Images\\nFormer Bell City ... \n", + "1091393 A more common sentiment than you would think (... \n", + "1091394 Paper Fashions Boutique is here to save you ti... \n", + "1091395 Admissions down in Argentina by 7% in first ha... \n", + "\n", + " id dump \\\n", + "0 CC-MAIN-2013-20 \n", + "1 CC-MAIN-2013-20 \n", + "2 CC-MAIN-2013-20 \n", + "3 CC-MAIN-2013-20 \n", + "4 CC-MAIN-2013-20 \n", + "... ... ... \n", + "1091391 CC-MAIN-2013-20 \n", + "1091392 CC-MAIN-2013-20 \n", + "1091393 CC-MAIN-2013-20 \n", + "1091394 CC-MAIN-2013-20 \n", + "1091395 CC-MAIN-2013-20 \n", + "\n", + " url \\\n", + "0 http://%20jwashington@ap.org/Content/Press-Rel... \n", + "1 http://1000awesomethings.com/2012/09/24/934-ad... \n", + "2 http://1027kord.com/car-wash-for-clara/ \n", + "3 http://1075zoofm.com/listeners-get-sky-high-vi... \n", + "4 http://1105govinfoevents.com/enterprisearchite... \n", + "... ... \n", + "1091391 http://www.scpr.org/news/2010/06/06/15880/7m-s... \n", + "1091392 http://www.scpr.org/news/2011/03/15/24996/atto... \n", + "1091393 http://www.scq.ubc.ca/sciencescouts/the-i%E2%8... \n", + "1091394 http://www.scrapscene.com/2010/08/new-scrapboo... \n", + "1091395 http://www.screendaily.com/admissions-down-in-... \n", + "\n", + " date \\\n", + "0 2013-05-18T05:48:54Z \n", + "1 2013-05-18T08:11:45Z \n", + "2 2013-05-18T06:49:55Z \n", + "3 2013-05-18T06:25:20Z \n", + "4 2013-05-18T05:27:01Z \n", + "... ... 
\n", + "1091391 2013-05-22T08:33:55Z \n", + "1091392 2013-05-22T07:56:02Z \n", + "1091393 2013-05-22T07:55:36Z \n", + "1091394 2013-05-22T08:27:53Z \n", + "1091395 2013-05-22T08:13:50Z \n", + "\n", + " file_path language \\\n", + "0 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "1 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "2 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "3 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "4 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "... ... ... \n", + "1091391 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "1091392 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "1091393 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "1091394 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "1091395 s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... en \n", + "\n", + " language_score token_count \n", + "0 0.972142 717 \n", + "1 0.947991 821 \n", + "2 0.911518 125 \n", + "3 0.956516 103 \n", + "4 0.798235 75 \n", + "... ... ... 
\n", + "1091391 0.971524 165 \n", + "1091392 0.973813 313 \n", + "1091393 0.969990 217 \n", + "1091394 0.963822 659 \n", + "1091395 0.925611 252 \n", + "\n", + "[1091396 rows x 9 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "import pyarrow.parquet as pq\n", + "import pandas as pd\n", + "table = pq.read_table('files-rep_removal/000_00000.parquet')\n", + "table.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e80e2e5a-4318-47bd-a7f0-a446f532e60e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}