Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Chromadb and SQLite #2

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions embeddings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import fitz # PyMuPDF
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_util

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF file, handed to ``fitz.open``.

    Returns:
        A single string holding each page's text back to back; no
        separator is inserted between pages.
    """
    with fitz.open(pdf_path) as document:
        page_texts = (
            document[index].get_text("text")
            for index in range(document.page_count)
        )
        # The generator is lazy, so it has to be consumed while the
        # document is still open.
        return "".join(page_texts)

# Step 2: Generate embeddings using Sentence Transformers
def generate_embeddings(text):
    """Encode a piece of text into one embedding vector.

    A fresh 'all-MiniLM-L6-v2' SentenceTransformer is instantiated on
    every call, ``text`` is encoded as a one-item batch, and the first
    (only) row of the result is returned.

    Args:
        text: The string to embed.

    Returns:
        The embedding produced for ``text``.
    """
    # NOTE(review): the whole document is embedded as a single input;
    # presumably the model truncates long inputs -- confirm whether
    # chunking is needed.
    single_item_batch = [text]
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return encoder.encode(single_item_batch)[0]

# Step 3: Store embeddings in ChromaDB
def store_in_chromadb(embedding, document_id, metadata):
    """Add one embedding, with its metadata, to the "pdf_embeddings" collection.

    Args:
        embedding: The vector to store. Anything exposing ``tolist()``
            (e.g. a NumPy array) is converted with it; any other
            iterable is copied into a plain list.
        document_id: Identifier under which the record is stored.
        metadata: Dict of metadata attached to the record.
    """
    # NOTE(review): chromadb.Client() is presumably an in-memory client,
    # so the record would not outlive this process -- confirm whether a
    # persistent client is wanted here.
    client = chromadb.Client()
    # Create the collection (or fetch it if it already exists)
    collection = client.get_or_create_collection("pdf_embeddings")

    if hasattr(embedding, "tolist"):
        vector = embedding.tolist()
    else:
        vector = list(embedding)

    # The identifier belongs in `ids` (a required argument of add());
    # `documents` is for raw document text, which this function is not
    # given, so it is omitted.
    collection.add(
        ids=[document_id],
        embeddings=[vector],
        metadatas=[metadata],
    )
    print("Document stored in ChromaDB with ID:", document_id)

# Main script
# Raw string literal: in a normal string "\U" begins a \UXXXXXXXX Unicode
# escape, so "C:\Users..." fails to compile with a SyntaxError.
pdf_path = r"C:\Users\sonar\Downloads\IST645F24R2RPatchala.pdf"
document_id = "doc_1"

# NOTE(review): the title text looks garbled ("1data", "geologicaltime") --
# confirm against the paper before relying on this metadata.
metadata = {
    "title": "deeptime: an R package that facilitates highly customizable visualizations of 1data over geologicaltime interval",
    "author": "William Gearty",
    "year": "2024",
}

# Extract text from PDF
text = extract_text_from_pdf(pdf_path)

# Generate embedding for the text
embedding = generate_embeddings(text)

# Store the embedding in ChromaDB
store_in_chromadb(embedding, document_id, metadata)
Binary file added papers.db
Binary file not shown.
92 changes: 92 additions & 0 deletions papers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import sqlite3
import pandas as pd

# Database and Table Initialization
def initialize_db(db_path="papers.db"):
    """Create the SQLite database file and its ``papers`` table if missing.

    The table has an auto-incrementing integer ``id`` primary key and
    three required text columns: ``name``, ``title`` and ``url``.
    Calling this again on an existing database is a no-op because of
    ``CREATE TABLE IF NOT EXISTS``.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            "papers.db", the location used before this parameter existed.
    """
    conn = sqlite3.connect(db_path)  # Creates the file if it does not exist
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS papers (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL,
                title TEXT NOT NULL,
                url TEXT NOT NULL
            )
        """)
        conn.commit()
    finally:
        # Close even when the statement fails, so the handle never leaks
        conn.close()

# Load Data from CSV
def load_data_from_csv(csv_file, db_path="papers.db"):
    """Insert every row of a CSV file into the ``papers`` table.

    The CSV is read as ISO-8859-1 and must have ``Name``, ``Title`` and
    ``URL`` columns, which map to the table's ``name``, ``title`` and
    ``url`` columns. Rows are appended; nothing is de-duplicated, so
    loading the same file twice stores its rows twice.

    Args:
        csv_file: Path of the CSV file to load.
        db_path: Path of the SQLite database file. Defaults to
            "papers.db", the location used before this parameter existed.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    # Parse the CSV before touching the database, so a missing or
    # malformed file cannot leave a connection open.
    df = pd.read_csv(csv_file, encoding='ISO-8859-1')
    rows = df[["Name", "Title", "URL"]].itertuples(index=False, name=None)

    conn = sqlite3.connect(db_path)
    try:
        # One batched statement instead of one execute() per row
        conn.executemany(
            """
            INSERT INTO papers (name, title, url)
            VALUES (?, ?, ?)
            """,
            rows,
        )
        conn.commit()
    finally:
        # Closing without a commit discards a partially applied batch
        conn.close()

# Retrieve Title by Name
def get_title_by_name(name, db_path="papers.db"):
    """Look up the title stored for a given name in the ``papers`` table.

    The query has no ORDER BY, so when several rows share the name the
    title returned is that of whichever row SQLite yields first.

    Args:
        name: Value to match against the ``name`` column.
        db_path: Path of the SQLite database file. Defaults to
            "papers.db", the location used before this parameter existed.

    Returns:
        The matching title, or the message
        ``"No title found for name: <name>"`` when no row matches.
    """
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT title FROM papers WHERE name = ?", (name,)
        ).fetchone()
    finally:
        # Close even when the query fails, so the handle never leaks
        conn.close()

    if row is None:
        return f"No title found for name: {name}"
    return row[0]

# Update a Record
def update_record(name, new_title, db_path="papers.db"):
    """Set a new title on every ``papers`` row whose name matches.

    Args:
        name: Value to match against the ``name`` column.
        new_title: Title to write into the matching rows.
        db_path: Path of the SQLite database file. Defaults to
            "papers.db", the location used before this parameter existed.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            """
            UPDATE papers
            SET title = ?
            WHERE name = ?
            """,
            (new_title, name),
        )
        conn.commit()
    finally:
        # Close even when the statement fails, so the handle never leaks
        conn.close()

# Delete a Record
def delete_record(name, db_path="papers.db"):
    """Delete every ``papers`` row whose name matches.

    Args:
        name: Value to match against the ``name`` column.
        db_path: Path of the SQLite database file. Defaults to
            "papers.db", the location used before this parameter existed.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("DELETE FROM papers WHERE name = ?", (name,))
        conn.commit()
    finally:
        # Close even when the statement fails, so the handle never leaks
        conn.close()

# Main Execution
def main():
    """Run the example workflow against the papers database.

    Steps, in order: create the database and table, load the rows of
    "newpapers.csv", print the title stored for "Alice", retitle
    "Alice" to "Updated Title", and delete the rows named "Bob". The
    names are placeholders; substitute values present in your own data.
    """
    # Make sure the database file and table exist
    initialize_db()

    # Import the CSV rows (point this at your own CSV file)
    load_data_from_csv("newpapers.csv")

    # Read one title back, then change it
    print(get_title_by_name("Alice"))
    update_record("Alice", "Updated Title")

    # Remove a record
    delete_record("Bob")


# Main Execution
if __name__ == "__main__":
    main()