Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Nov 21, 2024
2 parents 212d391 + b8b786e commit 66fff4f
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 1 deletion.
9 changes: 9 additions & 0 deletions pdelfin/beakerpipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from pdelfin.s3_queue import S3WorkQueue, WorkItem
from pdelfin.s3_utils import expand_s3_glob, get_s3_bytes, get_s3_bytes_with_backoff, parse_s3_path, download_zstd_csv, upload_zstd_csv, download_directory
from pdelfin.data.renderpdf import render_pdf_to_base64png
from pdelfin.filter.filter import PdfFilter, Language
from pdelfin.prompts import build_finetuning_prompt, PageResponse
from pdelfin.prompts.anchor import get_anchor_text
from pdelfin.check import check_poppler_version
Expand Down Expand Up @@ -71,6 +72,9 @@
# Process pool for offloading cpu bound work, like calculating anchor texts
process_pool = ProcessPoolExecutor()

# Filter object so we don't reload its language detection model each time
pdf_filter = PdfFilter(languages_to_keep={Language.ENGLISH, None}, apply_download_spam_check=True, apply_form_check=True)

SGLANG_SERVER_PORT = 30024

@dataclass(frozen=True)
Expand Down Expand Up @@ -234,6 +238,10 @@ async def process_pdf(args, session: httpx.AsyncClient, worker_id: int, pdf_s3_p

logger.info(f"Got {num_pages} pages to do for {pdf_s3_path} in worker {worker_id}")

if args.apply_filter and pdf_filter.filter_out_pdf(tf.name):
logger.info(f"Filtering out pdf {pdf_s3_path}")
return None

# List to hold the tasks for processing each page
page_tasks = []
page_results = []
Expand Down Expand Up @@ -704,6 +712,7 @@ async def main():
parser.add_argument('--max_page_retries', type=int, default=8, help='Max number of times we will retry rendering a page')
parser.add_argument('--max_page_error_rate', type=float, default=0.004, help='Rate of allowable failed pages in a document, 1/250 by default')
parser.add_argument('--workers', type=int, default=8, help='Number of workers to run at a time')
parser.add_argument('--apply_filter', action='store_true', help='Apply basic filtering to English pdfs which are not forms, and not likely seo spam')
parser.add_argument('--stats', action='store_true', help='Instead of running any job, reports some statistics about the current workspace')

# Model parameters
Expand Down
2 changes: 1 addition & 1 deletion pdelfin/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
_MINOR = "1"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "38"
_PATCH = "39"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""
Expand Down

0 comments on commit 66fff4f

Please sign in to comment.