Commit

update preprocessing scripts

SinclairCoder committed Jun 22, 2024
1 parent 80fcb0d commit 76804e7
Showing 36 changed files with 3,476 additions and 0 deletions.
11 changes: 11 additions & 0 deletions src/preprocessing/arXiv/README.md
@@ -0,0 +1,11 @@
# arXiv


- preparation for cleaning (see the wrapper sketch after this list):
  - `python file_type_rename_and_move.py`
  - `python rename_rest_gz_part_tex_file.py`
  - `python processing_rest_gz_part.py`
  - `python copy_to_merge_math_tex_data.py`

- cleaning at scale:
  - `bash clean_tex_at_scale.sh`
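If it helps to run the four preparation steps in one go, a minimal wrapper along these lines would work. Only the script names come from the list above; the wrapper file name and the sequential, stop-on-failure behaviour are assumptions, not part of this commit.

```python
# run_preparation.py -- hypothetical wrapper, not included in the repo
import subprocess
import sys

PREP_SCRIPTS = [
    "file_type_rename_and_move.py",
    "rename_rest_gz_part_tex_file.py",
    "processing_rest_gz_part.py",
    "copy_to_merge_math_tex_data.py",
]

for script in PREP_SCRIPTS:
    print(f"running {script} ...")
    # stop the pipeline as soon as one step fails
    result = subprocess.run([sys.executable, script])
    if result.returncode != 0:
        sys.exit(f"{script} failed with exit code {result.returncode}")

print("preparation finished; next step: bash clean_tex_at_scale.sh")
```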
50 changes: 50 additions & 0 deletions src/preprocessing/arXiv/arxiv-math-categories.txt
@@ -0,0 +1,50 @@
math.AG
math.AT
math.AP
math.CT
math.CA
math.CO
math.AC
math.CV
math.DG
math.DS
math.FA
math.GM
math.GN
math.GT
math.GR
math.HO
math.IT
math.KT
math.LO
math.MP
math.MG
math.NT
math.NA
math.OA
math.OC
math.PR
math.QA
math.RT
math.RA
math.SP
math.ST
math.SG
math-ph
quant-ph
cs.CC
cs.CG
cs.DM
cs.DS
cs.FL
cs.GT
cs.LG
cs.NA
cs.LO
q-fin.MF
stat.CO
stat.ML
stat.ME
stat.OT
stat.TH
econ.TH
23 changes: 23 additions & 0 deletions src/preprocessing/arXiv/clean_tex_at_scale.sh
@@ -0,0 +1,23 @@
#!/usr/bin/env bash
# Clean arXiv TeX sources at scale with processing_tex_at_scale.py.

# Small sample run on ./case_study (kept for reference):
# python processing_tex_at_scale.py \
#     --whole_arxiv_cleaning_mode \
#     --raw_arxiv_tex_files_dir ./case_study \
#     --cleaned_arxiv_tex_files_save_dir ./cleaned_case_study \
#     --cleaned_data_version 0.2

# Full run over the merged math TeX data, with a 300-second cleaning timeout:
python processing_tex_at_scale.py \
    --whole_arxiv_cleaning_mode \
    --raw_arxiv_tex_files_dir ./merge_math_tex_data \
    --cleaned_arxiv_tex_files_save_dir ./cleaned_merge_math_tex_data \
    --cleaned_data_version 0.1 \
    --cleaning_timeout_seconds 300

# Trial mode: parse a single cleaned .tex file (kept for reference):
# python processing_tex_at_scale.py \
#     --just_a_trial_mode \
#     --parsing_single_tex_file \
#     --parsing_single_tex_file_path ./cleaned_merge_math_tex_data/0704.0069.tex \
#     --output_path None
68 changes: 68 additions & 0 deletions src/preprocessing/arXiv/copy_to_merge_math_tex_data.py
@@ -0,0 +1,68 @@
import os
import subprocess
from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm

from utils import print_time

def obtain_file_type(filename):
    # Shell out to the `file` utility and keep the description after "filename:".
    proc = subprocess.run(['file', filename], stdout=subprocess.PIPE)
    output = proc.stdout.decode()
    filetype = output.split(":")[1].strip()
    print(f"{filename} - {filetype}")
    return filetype

def convert_file_type_to_extension(file_type):
if "gzip compressed data" in file_type:
if ".tex" in file_type and ".tar" not in file_type:
return ".tex.gz"
elif ".tar" in file_type:
return ".tar.gz"
else:
return ".gz"
elif "PDF document" in file_type:
return ".pdf"
elif "latex" in file_type.lower():
return ".tex"
else:
print(file_type)
return None

TARGET_DIR_PATH = "./merge_math_tex_data"
os.makedirs(TARGET_DIR_PATH, exist_ok=True)  # make sure the merge folder exists before copying


def copy_files(file_path):
    # Copy a single .tex file (or an extracted source directory) into the merge
    # folder, skipping anything that has already been copied.
    if os.path.isfile(file_path):
        extension = convert_file_type_to_extension(obtain_file_type(file_path))
        if file_path.endswith(".tex") or extension == ".tex":
            basename = os.path.basename(file_path)
            if os.path.exists(os.path.join(TARGET_DIR_PATH, basename)):
                return
            os.system(f"cp {file_path} {TARGET_DIR_PATH}")
    elif os.path.isdir(file_path):
        basename = os.path.basename(file_path)
        if os.path.exists(os.path.join(TARGET_DIR_PATH, basename)):
            return
        os.system(f"cp -r {file_path} {TARGET_DIR_PATH}")
    return


folders = ["tex_part", "tex_gz_part", "tar_gz_part", "rest_gz_part"]

futures = []
with print_time(f"copy files to {TARGET_DIR_PATH}"):
    with ThreadPoolExecutor(max_workers=200) as executor:
        for folder in tqdm(folders):
            filenames_in_folder = os.listdir(folder)
            for filename in tqdm(filenames_in_folder):
                full_path = os.path.join(folder, filename)
                future = executor.submit(copy_files, full_path)
                futures.append(future)
        # Wait for every copy task to finish (and surface any exception).
        for future in tqdm(futures):
            future.result()
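The copying above shells out to `cp` via `os.system`. A dependency-free alternative is the standard library's `shutil`; the sketch below is hypothetical (the helper name `copy_into` is made up, not part of this commit) and mirrors the two `cp` calls.

```python
import os
import shutil

def copy_into(target_dir, file_path):
    # Hypothetical drop-in for the os.system("cp ...") calls above.
    destination = os.path.join(target_dir, os.path.basename(file_path))
    if os.path.exists(destination):
        return  # already merged; mirrors the early return above
    if os.path.isdir(file_path):
        shutil.copytree(file_path, destination)  # like `cp -r`
    else:
        shutil.copy2(file_path, destination)     # copies the file, keeping metadata
```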
110 changes: 110 additions & 0 deletions src/preprocessing/arXiv/file_type_check.py
@@ -0,0 +1,110 @@
import os
import subprocess
from tqdm import tqdm
import zipfile
import rarfile
import tarfile
import random
import chardet

def obtain_file_type(filename):
proc = subprocess.run(['file', filename], stdout=subprocess.PIPE)
output = proc.stdout.decode()
filetype = output.split(":")[1].strip()
print(f"{filename} - {filetype}")
return filetype

# filename = "./0704.0086"
# obtain_file_type(filename)

file_signatures = {
    b'\x1f\x8b\x08': 'gz',
    b'\x42\x5a\x68': 'bz2',
    b'\x50\x4b\x03\x04': 'zip',
    b'%PDF': 'pdf',
    b'\xEF\xBB\xBF': 'text',          # UTF-8 BOM
    b'%!PS': 'ps',
    b'<!DOCTYPE HTML': 'html',
    b'\x75\x73\x74\x61\x72': 'tar',   # "ustar" magic; note it sits at offset 257, not at the start
    b'\xEF\xBB\xBF\x00': 'tex'        # never reached: the shorter BOM entry above matches first
}

def obtain_file_type_via_signatures(filename):
with open(filename, 'rb') as f:
file_start = f.read(10)
for signature, filetype in file_signatures.items():
if file_start.startswith(signature):
print(f'{filename} is {filetype}')
return
obtain_file_type(filename)
return


def detect_encoding(file_path):
    # Guess a file's text encoding with chardet (reads the whole file).
    with open(file_path, 'rb') as f:
        encoding = chardet.detect(f.read()).get('encoding')
    return encoding


if __name__ == "__main__":
# dir_to_check = "./arxiv-math-data"
# all_long_filetypes = []
# all_unique_filetypes = []
# for i, filename in enumerate(tqdm(os.listdir(dir_to_check))):
# full_path = os.path.join(dir_to_check, filename)
# obtain_file_type_via_signatures(full_path)
# obtain_file_type(full_path)
# print("============")
# if i == 50:
# break
# cur_type = obtain_file_type(full_path)
# all_long_filetypes.append(cur_type)
# try:
# meta_file_type = cur_type.split('"')[1].split(".")[-1]
# except Exception as e:
# print(f"{filename} - {meta_file_type}")
# print(meta_file_type)
# all_unique_filetypes.append(meta_file_type)
# all_unique_filetypes = list(set(all_unique_filetypes))
# print(f"All unique file types - {len(all_unique_filetypes)}")
# print(all_unique_filetypes)

# path = "./0704.0004"
# obtain_file_type_via_signatures(path)
# obtain_file_type(path)

# path = "./arxiv-math-data/0001003"
# if zipfile.is_zipfile(path):
# print("zip")
# elif rarfile.is_rarfile(path):
# print("rar")
# elif tarfile.is_tarfile(path):
# print("tar")
# elif os.path.splitext(path)[1] == '.tex':
# print("tex")

META_DIR = "./tar_gz_part"
filenames = os.listdir(META_DIR)
sampled_filenames = random.sample(filenames, 30)
for filename in sampled_filenames:
path = os.path.join(META_DIR, filename)
file_type = obtain_file_type(path)
# print(detect_encoding(path))
if "directory" in file_type:
os.system(f"cp -r {path} case_study/")


# with open("./tex_gz_part/0404458.tex", "r") as f:
# content = f.read()
# print(content)

# ./tex_gz_part/0404458.tex
# ./tex_gz_part/2304.04107.tex
# ./tex_gz_part/1702.06413.tex
# ./tex_gz_part/1410.2051.tex
# ./tex_gz_part/1005.3149.tex

# ascii, ISO-8859-1, None, Windows-1252, Windows-1254, utf-8, SHIFT_JIS, GB2312

# ISO-8859-1, Windows-1252, SHIFT_JIS
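The survey above shows several non-UTF-8 encodings among the extracted `.tex` files. As a rough sketch only (this helper is not part of the commit), such files could be normalised to UTF-8 with `chardet`, which this script already imports:

```python
import chardet

def reencode_to_utf8(path):
    # Hypothetical helper: rewrite a .tex file as UTF-8 using chardet's guess.
    with open(path, 'rb') as f:
        raw = f.read()
    encoding = chardet.detect(raw).get('encoding') or 'utf-8'
    text = raw.decode(encoding, errors='replace')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
```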