Skip to content

Commit

Permalink
feat: add remote upload ability and csv upload ability (#289)
Browse files Browse the repository at this point in the history
* feat: add remote upload ability and csv upload ability

* feat: add remote upload ability and csv upload ability

* fix: TS error

* remove unnecessary files
  • Loading branch information
shreyashankar authored Jan 23, 2025
1 parent 70aa7d0 commit 760e856
Show file tree
Hide file tree
Showing 8 changed files with 374 additions and 206 deletions.
4 changes: 2 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,11 @@ lzstring = "^1.0.4"
azure-ai-documentintelligence = { version = "^1.0.0b4", optional = true }
litellm = "^1.51.0"
pydantic = "^2.9.2"
httpx = { version = "^0.28.1", optional = true }

[tool.poetry.extras]
parsing = ["python-docx", "openpyxl", "pydub", "python-pptx", "azure-ai-documentintelligence", "paddlepaddle", "pymupdf"]
server = ["fastapi", "uvicorn", "docling", "azure-ai-formrecognizer", "azure-ai-documentintelligence"]
server = ["fastapi", "uvicorn", "docling", "azure-ai-formrecognizer", "azure-ai-documentintelligence", "httpx"]

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.2"
Expand Down
130 changes: 124 additions & 6 deletions server/app/routes/filesystem.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
from fastapi import APIRouter, UploadFile, File, Form, HTTPException
from fastapi.responses import FileResponse, JSONResponse
from typing import List, Optional
from typing import List, Optional, Union
import os
import yaml
import shutil
import httpx
import json
import csv
from io import StringIO
from pathlib import Path
from server.app.models import PipelineConfigRequest

Expand Down Expand Up @@ -33,19 +37,133 @@ async def check_namespace(namespace: str):
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to check/create namespace: {str(e)}")

def validate_json_content(content: bytes) -> None:
    """Validate that content can be parsed as JSON.

    Args:
        content: Raw payload to check. ``json.loads`` decodes bytes itself,
            so no prior decoding is required.

    Raises:
        HTTPException: With status 400 when the payload is not valid JSON,
            including when the bytes cannot be decoded as text at all.
    """
    try:
        json.loads(content)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # Undecodable bytes surface as UnicodeDecodeError, which is NOT a
        # JSONDecodeError; without catching it here a malformed upload would
        # escape as an unexpected error instead of a client-facing 400.
        raise HTTPException(status_code=400, detail=f"Invalid JSON format: {str(e)}") from e

def convert_csv_to_json(csv_content: bytes) -> bytes:
    """Convert CSV content to a UTF-8 encoded JSON array of row objects.

    The first CSV row is used as the header; every following row becomes one
    JSON object keyed by those header names (all values stay strings).

    Args:
        csv_content: Raw CSV bytes, expected to be UTF-8 (with or without a
            byte-order mark).

    Returns:
        The rows serialized with ``json.dumps`` and encoded as UTF-8.

    Raises:
        HTTPException: With status 400 when the bytes are not valid UTF-8,
            the CSV cannot be parsed, or there are no data rows.
    """
    try:
        # "utf-8-sig" reads plain UTF-8 and additionally strips a leading BOM
        # (typical for spreadsheet exports); with plain "utf-8" the BOM would
        # end up inside the first column name.
        csv_text = csv_content.decode("utf-8-sig")
    except UnicodeDecodeError as e:
        raise HTTPException(status_code=400, detail="Invalid CSV encoding") from e

    try:
        # newline="" leaves line-ending handling to the csv module, as its
        # documentation recommends for the objects it reads from.
        rows = list(csv.DictReader(StringIO(csv_text, newline="")))
    except csv.Error as e:
        raise HTTPException(status_code=400, detail=f"Invalid CSV format: {str(e)}") from e

    if not rows:
        raise HTTPException(status_code=400, detail="CSV file is empty")

    return json.dumps(rows).encode("utf-8")

def is_likely_csv(content: bytes, filename: str) -> bool:
    """Guess whether a payload is CSV from its file name and first line.

    Args:
        content: Raw payload bytes.
        filename: Name associated with the payload; a ``.csv`` suffix (any
            case) is treated as conclusive.

    Returns:
        True when the name ends in ``.csv``, or when the first line contains a
        comma and none of the characters ``{}[]``. False otherwise, including
        when the first line is not valid UTF-8.
    """
    if filename.lower().endswith('.csv'):
        return True

    # No clear extension: sniff the first line only. maxsplit=1 avoids
    # splitting the entire payload just to look at its head.
    try:
        first_line = content.split(b'\n', 1)[0].decode('utf-8')
    except UnicodeDecodeError:
        # Undecodable content is not treated as CSV.
        return False

    # A comma with no obvious JSON punctuation is taken as a CSV header/row.
    return ',' in first_line and not any(c in first_line for c in '{}[]')

def _sanitize_upload_name(raw_name: Optional[str], default: str = "dataset.json") -> str:
    """Reduce a client-supplied name to a bare file name, or ``default`` if nothing usable remains."""
    # Backslashes are treated as separators too, then only the final path
    # component is kept, so names like "../../x" or "/abs/x" cannot make the
    # later join escape the upload directory.
    candidate = Path((raw_name or "").replace("\\", "/")).name
    if candidate in ("", ".", ".."):
        return default
    return candidate


def _filename_from_url(url: str) -> str:
    """Derive a safe file name from the path component of ``url`` (query string and fragment ignored)."""
    from urllib.parse import unquote, urlsplit

    # Unquote before sanitizing so an encoded separator (%2F) is stripped too.
    return _sanitize_upload_name(unquote(urlsplit(url).path))


def _as_json_name(filename: str) -> str:
    """Swap a trailing ``.csv`` (any case) for ``.json``; other names are returned unchanged."""
    if filename.lower().endswith(".csv"):
        return filename[: -len(".csv")] + ".json"
    return filename


async def _download_url(url: str) -> bytes:
    """Fetch ``url`` with a streamed GET (following redirects) and return the body; non-200 becomes a 400."""
    async with httpx.AsyncClient() as client:
        async with client.stream('GET', url, follow_redirects=True) as response:
            if response.status_code != 200:
                raise HTTPException(
                    status_code=400,
                    detail=f"Failed to download from URL: {response.status_code}"
                )
            chunks = [chunk async for chunk in response.aiter_bytes(chunk_size=8192)]
    return b''.join(chunks)


@router.post("/upload-file")
async def upload_file(
    file: Optional[UploadFile] = File(None),
    url: Optional[str] = Form(None),
    namespace: str = Form(...)
):
    """Upload a file to the namespace files directory, either from a direct upload or a URL.

    CSV input is converted to JSON before saving, and the stored payload must
    parse as JSON. When both ``file`` and ``url`` are given, ``url`` wins.

    Args:
        file: Directly uploaded file (optional).
        url: Remote location to download the dataset from (optional).
        namespace: Namespace whose ``files`` directory receives the upload.

    Returns:
        A dict ``{"path": <saved file path>}``.

    Raises:
        HTTPException: 400 when neither source is provided, the download does
            not return 200, CSV conversion fails, or the content is not valid
            JSON; 500 for any other failure.
    """
    try:
        if not file and not url:
            raise HTTPException(status_code=400, detail="Either file or url must be provided")

        upload_dir = get_namespace_dir(namespace) / "files"
        upload_dir.mkdir(parents=True, exist_ok=True)

        if url:
            # NOTE(review): the URL is caller-supplied and fetched server-side;
            # consider restricting schemes/hosts if this endpoint is exposed
            # to untrusted clients.
            filename = _filename_from_url(url)
            content = await _download_url(url)
            # Remote names are often uninformative, so also sniff the content.
            needs_csv_conversion = is_likely_csv(content, filename)
        else:
            filename = _sanitize_upload_name(file.filename)
            content = await file.read()
            needs_csv_conversion = filename.lower().endswith('.csv')

        if needs_csv_conversion:
            try:
                content = convert_csv_to_json(content)
            except HTTPException as e:
                raise HTTPException(
                    status_code=400,
                    detail=f"Failed to convert CSV to JSON: {str(e.detail)}"
                ) from e

        # Whatever the source, only valid JSON is written to disk.
        validate_json_content(content)

        # Converted CSV uploads are stored under a .json name.
        file_path = upload_dir / _as_json_name(filename)
        with file_path.open("wb") as f:
            f.write(content)

        return {"path": str(file_path)}
    except HTTPException:
        # Already carries the intended status code and detail.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to upload file: {str(e)}") from e

@router.post("/save-documents")
Expand Down
111 changes: 0 additions & 111 deletions todos.md

This file was deleted.

12 changes: 0 additions & 12 deletions vision.md

This file was deleted.

Loading

0 comments on commit 760e856

Please sign in to comment.