Skip to content

Commit

Permalink
assemble
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Oct 10, 2024
1 parent 312847a commit af03358
Showing 1 changed file with 6 additions and 3 deletions.
9 changes: 6 additions & 3 deletions pdelfin/assemblepipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
def build_index(s3_path):
# Hash the s3_path to get a cache key
cache_key = hashlib.sha256(s3_path.encode('utf-8')).hexdigest()
cache_dir = os.path.join('.cache', cache_key)
os.makedirs(cache_dir, exist_ok=True)
db_path = os.path.join(cache_dir, 'index.db')
home_cache_dir = os.path.join(os.path.expanduser('~'), '.cache', 'pdelfin', cache_key)
os.makedirs(home_cache_dir, exist_ok=True)
db_path = os.path.join(home_cache_dir, 'index.db')

# Connect to SQLite and create the tables if they do not already exist
print("Building page index at", db_path)
Expand Down Expand Up @@ -137,4 +137,7 @@ def process_jsonl_content(content, s3_path):
parser.add_argument('s3_path', help='The S3 path to process (e.g., s3://bucket/prefix/)')
args = parser.parse_args()

# Step one, build an index of all the pages that were processed
build_index(args.s3_path)


0 comments on commit af03358

Please sign in to comment.