Skip to content

Commit

Permalink
Fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Oct 10, 2024
1 parent 2864f90 commit 85f2dc6
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 5 deletions.
9 changes: 5 additions & 4 deletions pdelfin/data/convertsilver_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,12 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
local_pdf_path = cached_path(s3_path, quiet=True)

from pdelfin.prompts.anchor import get_anchor_text
+ from pdelfin.data.buildsilver import build_page_query
+ obj = build_page_query(local_pdf_path, s3_path, page)
+ # raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")

- raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
-
- from pdelfin.prompts import build_openai_silver_data_prompt
- obj["body"]["messages"][0]["content"][0]["text"] = build_openai_silver_data_prompt(raw_page_text)
+ # from pdelfin.prompts import build_openai_silver_data_prompt
+ # obj["body"]["messages"][0]["content"][0]["text"] = build_openai_silver_data_prompt(raw_page_text)

if obj is not None:
outfile.write(json.dumps(obj) + '\n')
Expand Down
2 changes: 1 addition & 1 deletion pdelfin/data/runopenaibatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def get_done_total(folder_path):

# Main function to process all .jsonl files in a folder
def process_folder(folder_path: str, max_gb: int):
- output_folder = f"{folder_path}_done"
+ output_folder = f"{folder_path.rstrip('/')}_done"
os.makedirs(output_folder, exist_ok=True)
last_loop_time = datetime.datetime.now()

Expand Down

0 comments on commit 85f2dc6

Please sign in to comment.