Skip to content

Commit

Permalink
Refactoring
Browse files: browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Oct 9, 2024
1 parent 0c56dec commit 4bf6e7a
Show file tree
Hide file tree
Showing 4 changed files with 3 additions and 61 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
# ml stuff
wandb/
*histogram.png

/*.html

Expand Down
51 changes: 0 additions & 51 deletions pdelfin/filter/imagedetect.py

This file was deleted.

4 changes: 2 additions & 2 deletions tests/test_dataloader.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -117,8 +117,8 @@ def testExtractResponse(self):
print(response_data[0])

def testPyArrowDirectJson(self):
query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_train/*.jsonl"
response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_train/*.json"
query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl"
response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json"

all_files = list_dataset_files(query_glob_path)

Expand Down
8 changes: 0 additions & 8 deletions tests/test_filter.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,7 +4,6 @@
from pypdf import PdfReader

from pdelfin.filter import PdfFilter
from pdelfin.filter.imagedetect import pdf_page_image_area


class PdfFilterTest(unittest.TestCase):
Expand All @@ -16,10 +15,3 @@ def testFormLaterPages(self):
self.filter._is_form(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "form_on_later_pages.pdf"))
)


class ImageDetectionTest(unittest.TestCase):
def testSlideshowMostlyImages(self):
self.pdf = PdfReader(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "slideshow_mostly_images.pdf"))

for page in range(self.pdf.get_num_pages()):
print(page, pdf_page_image_area(self.pdf, page + 1))

0 comments on commit 4bf6e7a

Please sign in to comment.