Skip to content

Commit

Permalink
Update all docs at once
Browse files: browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Oct 28, 2024
1 parent 062abff commit a3e7654
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ dolma_previews/*
s2_previews/*
gnarly_previews/*
s2orc_previews/*
s2orc_previews_3200/*
/*.html


Expand Down
16 changes: 10 additions & 6 deletions pdelfin/birrpipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,12 +206,18 @@ def add_pdf(self, s3_path: str, num_pages: int, status: str = 'pending') -> None
except sqlite3.IntegrityError:
print(f"PDF with s3_path '{s3_path}' already exists.")

def update_pdf_status(self, s3_path: str, new_status: str) -> None:
self.cursor.execute("""
def update_pdf_statuses(self, status_updates: dict[str, str]) -> None:
"""
Update the status of multiple PDFs in the database.
:param status_updates: A dictionary where each key is an s3_path (str) and
each value is the new status (str) for that PDF.
"""
self.cursor.executemany("""
UPDATE pdfs
SET status = ?
WHERE s3_path = ?
""", (new_status, s3_path))
""", [(new_status, s3_path) for s3_path, new_status in status_updates.items()])
self.conn.commit()

def get_pdf(self, s3_path: str) -> Optional[PDFRecord]:
Expand Down Expand Up @@ -569,9 +575,7 @@ def build_dolma_doc(s3_workspace: str, pdf: DatabaseManager.PDFRecord) -> Option

def mark_pdfs_done(s3_workspace: str, dolma_docs: list[dict]):
db = DatabaseManager(s3_workspace, skip_init=True)

for doc in dolma_docs:
db.update_pdf_status(doc["metadata"]["Source-File"], "completed")
db.update_pdf_statuses({doc["metadata"]["Source-File"]: "completed" for doc in dolma_docs})

def get_current_round(s3_workspace: str) -> int:
path = s3_workspace[5:]
Expand Down

0 comments on commit a3e7654

Please sign in to comment.