Commit

update preprocessing scripts

SinclairCoder committed Jun 22, 2024
1 parent 80fcb0d commit 76804e7
Showing 36 changed files with 3,476 additions and 0 deletions.
11 changes: 11 additions & 0 deletions src/preprocessing/arXiv/README.md
@@ -0,0 +1,11 @@
# arXiv


- preparation for cleaning (see the wrapper sketch after this list):
  - `python file_type_rename_and_move.py`
  - `python rename_rest_gz_part_tex_file.py`
  - `python processing_rest_gz_part.py`
  - `python copy_to_merge_math_tex_data.py`

- cleaning at scale:
  - `bash clean_tex_at_scale.sh`
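If it helps to run the four preparation steps in one go, a minimal wrapper along these lines would work. Only the script names come from the list above; the wrapper file name and the sequential, stop-on-failure behaviour are assumptions, not part of this commit.

```python
# run_preparation.py -- hypothetical wrapper, not included in the repo
import subprocess
import sys

PREP_SCRIPTS = [
    "file_type_rename_and_move.py",
    "rename_rest_gz_part_tex_file.py",
    "processing_rest_gz_part.py",
    "copy_to_merge_math_tex_data.py",
]

for script in PREP_SCRIPTS:
    print(f"running {script} ...")
    # stop the pipeline as soon as one step fails
    result = subprocess.run([sys.executable, script])
    if result.returncode != 0:
        sys.exit(f"{script} failed with exit code {result.returncode}")

print("preparation finished; next step: bash clean_tex_at_scale.sh")
```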
50 changes: 50 additions & 0 deletions src/preprocessing/arXiv/arxiv-math-categories.txt
@@ -0,0 +1,50 @@
math.AG
math.AT
math.AP
math.CT
math.CA
math.CO
math.AC
math.CV
math.DG
math.DS
math.FA
math.GM
math.GN
math.GT
math.GR
math.HO
math.IT
math.KT
math.LO
math.MP
math.MG
math.NT
math.NA
math.OA
math.OC
math.PR
math.QA
math.RT
math.RA
math.SP
math.ST
math.SG
math-ph
quant-ph
cs.CC
cs.CG
cs.DM
cs.DS
cs.FL
cs.GT
cs.LG
cs.NA
cs.LO
q-fin.MF
stat.CO
stat.ML
stat.ME
stat.OT
stat.TH
econ.TH
23 changes: 23 additions & 0 deletions src/preprocessing/arXiv/clean_tex_at_scale.sh
@@ -0,0 +1,23 @@
#!/usr/bin/env bash
# Clean arXiv TeX sources at scale with processing_tex_at_scale.py.

# Small sample run on ./case_study (kept for reference):
# python processing_tex_at_scale.py \
#     --whole_arxiv_cleaning_mode \
#     --raw_arxiv_tex_files_dir ./case_study \
#     --cleaned_arxiv_tex_files_save_dir ./cleaned_case_study \
#     --cleaned_data_version 0.2

# Full run over the merged math TeX data, with a 300-second cleaning timeout:
python processing_tex_at_scale.py \
    --whole_arxiv_cleaning_mode \
    --raw_arxiv_tex_files_dir ./merge_math_tex_data \
    --cleaned_arxiv_tex_files_save_dir ./cleaned_merge_math_tex_data \
    --cleaned_data_version 0.1 \
    --cleaning_timeout_seconds 300

# Trial mode: parse a single cleaned .tex file (kept for reference):
# python processing_tex_at_scale.py \
#     --just_a_trial_mode \
#     --parsing_single_tex_file \
#     --parsing_single_tex_file_path ./cleaned_merge_math_tex_data/0704.0069.tex \
#     --output_path None
68 changes: 68 additions & 0 deletions src/preprocessing/arXiv/copy_to_merge_math_tex_data.py
@@ -0,0 +1,68 @@
import os
import subprocess
from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm

from utils import print_time

def obtain_file_type(filename):
    # Shell out to the `file` utility and keep the description after "filename:".
    proc = subprocess.run(['file', filename], stdout=subprocess.PIPE)
    output = proc.stdout.decode()
    filetype = output.split(":")[1].strip()
    print(f"{filename} - {filetype}")
    return filetype

def convert_file_type_to_extension(file_type):
if "gzip compressed data" in file_type:
if ".tex" in file_type and ".tar" not in file_type:
return ".tex.gz"
elif ".tar" in file_type:
return ".tar.gz"
else:
return ".gz"
elif "PDF document" in file_type:
return ".pdf"
elif "latex" in file_type.lower():
return ".tex"
else:
print(file_type)
return None

TARGET_DIR_PATH = "./merge_math_tex_data"
os.makedirs(TARGET_DIR_PATH, exist_ok=True)  # make sure the merge folder exists before copying


def copy_files(file_path):
    # Copy a single .tex file (or an extracted source directory) into the merge
    # folder, skipping anything that has already been copied.
    if os.path.isfile(file_path):
        extension = convert_file_type_to_extension(obtain_file_type(file_path))
        if file_path.endswith(".tex") or extension == ".tex":
            basename = os.path.basename(file_path)
            if os.path.exists(os.path.join(TARGET_DIR_PATH, basename)):
                return
            os.system(f"cp {file_path} {TARGET_DIR_PATH}")
    elif os.path.isdir(file_path):
        basename = os.path.basename(file_path)
        if os.path.exists(os.path.join(TARGET_DIR_PATH, basename)):
            return
        os.system(f"cp -r {file_path} {TARGET_DIR_PATH}")
    return


folders = ["tex_part", "tex_gz_part", "tar_gz_part", "rest_gz_part"]

futures = []
with print_time(f"copy files to {TARGET_DIR_PATH}"):
    with ThreadPoolExecutor(max_workers=200) as executor:
        for folder in tqdm(folders):
            filenames_in_folder = os.listdir(folder)
            for filename in tqdm(filenames_in_folder):
                full_path = os.path.join(folder, filename)
                future = executor.submit(copy_files, full_path)
                futures.append(future)
        # Wait for every copy task to finish (and surface any exception).
        for future in tqdm(futures):
            future.result()
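The copying above shells out to `cp` via `os.system`. A dependency-free alternative is the standard library's `shutil`; the sketch below is hypothetical (the helper name `copy_into` is made up, not part of this commit) and mirrors the two `cp` calls.

```python
import os
import shutil

def copy_into(target_dir, file_path):
    # Hypothetical drop-in for the os.system("cp ...") calls above.
    destination = os.path.join(target_dir, os.path.basename(file_path))
    if os.path.exists(destination):
        return  # already merged; mirrors the early return above
    if os.path.isdir(file_path):
        shutil.copytree(file_path, destination)  # like `cp -r`
    else:
        shutil.copy2(file_path, destination)     # copies the file, keeping metadata
```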
110 changes: 110 additions & 0 deletions src/preprocessing/arXiv/file_type_check.py
@@ -0,0 +1,110 @@
import os
import subprocess
from tqdm import tqdm
import zipfile
import rarfile
import tarfile
import random
import chardet

def obtain_file_type(filename):
proc = subprocess.run(['file', filename], stdout=subprocess.PIPE)
output = proc.stdout.decode()
filetype = output.split(":")[1].strip()
print(f"{filename} - {filetype}")
return filetype

# filename = "./0704.0086"
# obtain_file_type(filename)

file_signatures = {
    b'\x1f\x8b\x08': 'gz',
    b'\x42\x5a\x68': 'bz2',
    b'\x50\x4b\x03\x04': 'zip',
    b'%PDF': 'pdf',
    b'\xEF\xBB\xBF': 'text',          # UTF-8 BOM
    b'%!PS': 'ps',
    b'<!DOCTYPE HTML': 'html',
    b'\x75\x73\x74\x61\x72': 'tar',   # "ustar" magic; note it sits at offset 257, not at the start
    b'\xEF\xBB\xBF\x00': 'tex'        # never reached: the shorter BOM entry above matches first
}

def obtain_file_type_via_signatures(filename):
with open(filename, 'rb') as f:
file_start = f.read(10)
for signature, filetype in file_signatures.items():
if file_start.startswith(signature):
print(f'{filename} is {filetype}')
return
obtain_file_type(filename)
return


def detect_encoding(file_path):
    # Guess a file's text encoding with chardet (reads the whole file).
    with open(file_path, 'rb') as f:
        encoding = chardet.detect(f.read()).get('encoding')
    return encoding


if __name__ == "__main__":
# dir_to_check = "./arxiv-math-data"
# all_long_filetypes = []
# all_unique_filetypes = []
# for i, filename in enumerate(tqdm(os.listdir(dir_to_check))):
# full_path = os.path.join(dir_to_check, filename)
# obtain_file_type_via_signatures(full_path)
# obtain_file_type(full_path)
# print("============")
# if i == 50:
# break
# cur_type = obtain_file_type(full_path)
# all_long_filetypes.append(cur_type)
# try:
# meta_file_type = cur_type.split('"')[1].split(".")[-1]
# except Exception as e:
# print(f"{filename} - {meta_file_type}")
# print(meta_file_type)
# all_unique_filetypes.append(meta_file_type)
# all_unique_filetypes = list(set(all_unique_filetypes))
# print(f"All unique file types - {len(all_unique_filetypes)}")
# print(all_unique_filetypes)

# path = "./0704.0004"
# obtain_file_type_via_signatures(path)
# obtain_file_type(path)

# path = "./arxiv-math-data/0001003"
# if zipfile.is_zipfile(path):
# print("zip")
# elif rarfile.is_rarfile(path):
# print("rar")
# elif tarfile.is_tarfile(path):
# print("tar")
# elif os.path.splitext(path)[1] == '.tex':
# print("tex")

META_DIR = "./tar_gz_part"
filenames = os.listdir(META_DIR)
sampled_filenames = random.sample(filenames, 30)
for filename in sampled_filenames:
path = os.path.join(META_DIR, filename)
file_type = obtain_file_type(path)
# print(detect_encoding(path))
if "directory" in file_type:
os.system(f"cp -r {path} case_study/")


# with open("./tex_gz_part/0404458.tex", "r") as f:
# content = f.read()
# print(content)

# ./tex_gz_part/0404458.tex
# ./tex_gz_part/2304.04107.tex
# ./tex_gz_part/1702.06413.tex
# ./tex_gz_part/1410.2051.tex
# ./tex_gz_part/1005.3149.tex

# ascii, ISO-8859-1, None, Windows-1252, Windows-1254, utf-8, SHIFT_JIS, GB2312

# ISO-8859-1, Windows-1252, SHIFT_JIS
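The survey above shows several non-UTF-8 encodings among the extracted `.tex` files. As a rough sketch only (this helper is not part of the commit), such files could be normalised to UTF-8 with `chardet`, which this script already imports:

```python
import chardet

def reencode_to_utf8(path):
    # Hypothetical helper: rewrite a .tex file as UTF-8 using chardet's guess.
    with open(path, 'rb') as f:
        raw = f.read()
    encoding = chardet.detect(raw).get('encoding') or 'utf-8'
    text = raw.decode(encoding, errors='replace')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
```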