From 760e856a37a6bcb9603abb309c5c53854f5c50ff Mon Sep 17 00:00:00 2001
From: Shreya Shankar
Date: Thu, 23 Jan 2025 09:45:56 +0100
Subject: [PATCH] feat: add remote upload ability and csv upload ability (#289)

* feat: add remote upload ability and csv upload ability

* feat: add remote upload ability and csv upload ability

* fix: TS error

* remove unnecessary files
---
 poetry.lock                                |   4 +-
 pyproject.toml                             |   3 +-
 server/app/routes/filesystem.py            | 130 ++++++++++++++-
 todos.md                                   | 111 -------------
 vision.md                                  |  12 --
 website/src/components/FileExplorer.tsx    | 183 ++++++++++++++-------
 website/src/components/TutorialsDialog.tsx |   4 +-
 website/src/hooks/useDatasetUpload.ts      | 133 +++++++++++++--
 8 files changed, 374 insertions(+), 206 deletions(-)
 delete mode 100644 todos.md
 delete mode 100644 vision.md

diff --git a/poetry.lock b/poetry.lock
index 9681062b..7c0db433 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -5328,9 +5328,9 @@ type = ["pytest-mypy"]
 
 [extras]
 parsing = ["azure-ai-documentintelligence", "openpyxl", "paddlepaddle", "pydub", "pymupdf", "python-docx", "python-pptx"]
-server = ["azure-ai-documentintelligence", "azure-ai-formrecognizer", "docling", "fastapi", "uvicorn"]
+server = ["azure-ai-documentintelligence", "azure-ai-formrecognizer", "docling", "fastapi", "httpx", "uvicorn"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "a7ff2aeefcd400baf5f7204c1a0875b9c4aa010014b3c8a0c73300db984387d6"
+content-hash = "d97320f86d224de38d08d74c519242f4ad07dd39988a0abd606597c9d8e40254"

diff --git a/pyproject.toml b/pyproject.toml
index 632e3d99..1e06b5ba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,10 +40,11 @@ lzstring = "^1.0.4"
 azure-ai-documentintelligence = { version = "^1.0.0b4", optional = true }
 litellm = "^1.51.0"
 pydantic = "^2.9.2"
+httpx = { version = "^0.28.1", optional = true }
 
 [tool.poetry.extras]
 parsing = ["python-docx", "openpyxl", "pydub", "python-pptx", "azure-ai-documentintelligence", "paddlepaddle", "pymupdf"]
-server = ["fastapi", "uvicorn", "docling", "azure-ai-formrecognizer", "azure-ai-documentintelligence"]
+server = ["fastapi", "uvicorn", "docling", "azure-ai-formrecognizer", "azure-ai-documentintelligence", "httpx"]
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.3.2"

diff --git a/server/app/routes/filesystem.py b/server/app/routes/filesystem.py
index e33a1709..4ef0ae55 100644
--- a/server/app/routes/filesystem.py
+++ b/server/app/routes/filesystem.py
@@ -1,9 +1,13 @@
 from fastapi import APIRouter, UploadFile, File, Form, HTTPException
 from fastapi.responses import FileResponse, JSONResponse
-from typing import List, Optional
+from typing import List, Optional, Union
 import os
 import yaml
 import shutil
+import httpx
+import json
+import csv
+from io import StringIO
 from pathlib import Path
 
 from server.app.models import PipelineConfigRequest
@@ -33,19 +37,133 @@ async def check_namespace(namespace: str):
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Failed to check/create namespace: {str(e)}")
 
+def validate_json_content(content: bytes) -> None:
+    """Validate that content can be parsed as JSON"""
+    try:
+        json.loads(content)
+    except json.JSONDecodeError as e:
+        raise HTTPException(status_code=400, detail=f"Invalid JSON format: {str(e)}")
+
+def convert_csv_to_json(csv_content: bytes) -> bytes:
+    """Convert CSV content to JSON format"""
+    try:
+        # Decode bytes to string and create a StringIO object
+        csv_string = csv_content.decode('utf-8')
+        csv_file = StringIO(csv_string)
+
+        # Read CSV and convert to list of dictionaries
+        reader = csv.DictReader(csv_file)
+        data = list(reader)
+
+        if not data:
+            raise HTTPException(status_code=400, detail="CSV file is empty")
+
+        # Convert back to JSON bytes
+        return json.dumps(data).encode('utf-8')
+    except UnicodeDecodeError:
+        raise HTTPException(status_code=400, detail="Invalid CSV encoding")
+    except csv.Error as e:
+        raise HTTPException(status_code=400, detail=f"Invalid CSV format: {str(e)}")
+
+def is_likely_csv(content: bytes, filename: str) -> bool:
+    """Check if content is likely to be CSV based on content and filename"""
+    # Check filename extension
+    if filename.lower().endswith('.csv'):
+        return True
+
+    # If no clear extension, try to detect CSV content
+    try:
+        # Take first line and check if it looks like CSV
+        first_line = content.split(b'\n')[0].decode('utf-8')
+        # Check if line contains commas and no obvious JSON characters
+        return ',' in first_line and not any(c in first_line for c in '{}[]')
+    except:
+        return False
+
 @router.post("/upload-file")
-async def upload_file(file: UploadFile = File(...), namespace: str = Form(...)):
-    """Upload a single file to the namespace files directory"""
+async def upload_file(
+    file: Optional[UploadFile] = File(None),
+    url: Optional[str] = Form(None),
+    namespace: str = Form(...)
+):
+    """Upload a file to the namespace files directory, either from a direct upload or a URL"""
     try:
+        if not file and not url:
+            raise HTTPException(status_code=400, detail="Either file or url must be provided")
+
         upload_dir = get_namespace_dir(namespace) / "files"
         upload_dir.mkdir(parents=True, exist_ok=True)
-        file_path = upload_dir / file.filename
-        with file_path.open("wb") as f:
-            shutil.copyfileobj(file.file, f)
+        if url:
+            # Get filename from URL or default to dataset.json
+            filename = url.split("/")[-1] or "dataset.json"
+
+            file_path = upload_dir / filename.replace('.csv', '.json')
+
+            # Handle URL download
+            async with httpx.AsyncClient() as client:
+                async with client.stream(
+                    'GET',
+                    url,
+                    follow_redirects=True,
+                ) as response:
+                    if response.status_code != 200:
+                        raise HTTPException(
+                            status_code=400,
+                            detail=f"Failed to download from URL: {response.status_code}"
+                        )
+
+                    # Save the file in chunks
+                    content_chunks = []
+                    async for chunk in response.aiter_bytes(chunk_size=8192):
+                        if chunk:  # filter out keep-alive new chunks
+                            content_chunks.append(chunk)
+
+                    # Combine chunks
+                    content = b''.join(content_chunks)
+
+                    # Check if content is CSV and convert if needed
+                    if is_likely_csv(content, filename):
+                        try:
+                            content = convert_csv_to_json(content)
+                        except HTTPException as e:
+                            raise HTTPException(
+                                status_code=400,
+                                detail=f"Failed to convert CSV to JSON: {str(e.detail)}"
+                            )
+
+                    # Validate JSON content
+                    validate_json_content(content)
+
+                    # Write to file
+                    with file_path.open("wb") as f:
+                        f.write(content)
+        else:
+            # Handle direct file upload
+            file_content = await file.read()
+
+            # Check if content is CSV and convert if needed
+            if file.filename.lower().endswith('.csv'):
+                try:
+                    file_content = convert_csv_to_json(file_content)
+                except HTTPException as e:
+                    raise HTTPException(
+                        status_code=400,
+                        detail=f"Failed to convert CSV to JSON: {str(e.detail)}"
+                    )
+
+            # Validate JSON content
+            validate_json_content(file_content)
+
+            # Always save as .json
+            file_path = upload_dir / file.filename.replace('.csv', '.json')
+            with file_path.open("wb") as f:
+                f.write(file_content)
 
         return {"path": str(file_path)}
     except Exception as e:
+        if isinstance(e, HTTPException):
+            raise e
+        raise
HTTPException(status_code=500, detail=f"Failed to upload file: {str(e)}") @router.post("/save-documents") diff --git a/todos.md b/todos.md deleted file mode 100644 index 89f9c24b..00000000 --- a/todos.md +++ /dev/null @@ -1,111 +0,0 @@ -TODO: - -- [x] Think about fuzzy matching for reduce / entity resolution -- [x] Support equijoins -- [x] Inputs should be accessed as input['title'] instead of just title / everything be jinja -- [x] Make flatten a separate operator with flatten_key (or nothing) -- [x] Convert parallel flatmap to parallel map -- [x] Write documentation & restructure codebase -- [x] Write tests -- [x] Chunking/splitting with peripheral chunks -- [x] Write build phase -- [x] Add keys / inputs to reduce -- [x] For reduce we should pass through keys -- [x] Optimize maps - - [x] Track costs for the optimizer - - [x] Generate multiple plans and evaluate them instead of generating one plan - - [x] Don't use an LLM to determine the right chunk size; try searching many chunk sizes - - [x] Call llm agent multiple times on different random inputs & average results - - [x] Decompose map to be a chain or parallel map - - [ ] Debug finnicky combine prompts -- [x] Optimize resolvers (add blocking rules) -- [x] Optimize reduce - - [x] Implement fold pattern - - [x] Optimize folds - - [x] Stratified sample the reduce operations based on the groupby results - - [x] Synthesize multiple fold prompts - - [x] Implement merge pattern - - [ ] Optimize merges - - [x] Synthesize merge prompts - - [x] Derive num_parallel_folds in the reduce op itself (saving the runtimes of folds and merges) - - [ ] Try various batch sizes -- [x] Optimize equijoins -- [x] Support multiple operator workflows in the optimizer - - [x] Calculate explosion factor - - [x] Incorporate selectivity estimates in the multi-operator optimization -- [x] Write gleaning optimization step - - [ ] Incorporate gleaning in reduce -- [x] Support a summary type reduce, where we sample k elements to put in the prompt do a batch reduce -- [x] Write a non-associative reduce -- [x] Write documentation on how all the operators work -- [x] Auto-generate resolver -- [x] Support summarizing peripheral chunks -- [x] Change validation to be pairwise comparisons (for map, at least) (Aug 14 & 15) - - [x] Only compare the plans that are highest scoring -- [x] Support unnesting -- [x] Reduce operator: support reduce keys as list -- [x] Refactor map optimizer -- [x] In map optimizer, when creating a split, add a uuid to each record being split (instead of relying on some doc id) -- [ ] Recursively optimize operations (e.g., reduces in maps) (Aug 16 & 17 & 19) - - [x] In map optimizer: if the submap output is a list, then we should add an unnest operation - - [x] In reduce optimizer: query agent if we should drill-down / do a subreduce - - [x] In map optimizer: prune the chunk size plans that don't give individually good results for the chunks - - [x] In map optimizer: optimize the reduce operator for each chunk size plan - - [x] In reduce optimizer: synthesize resolver if need be - - [x] In resolve optimizer, support list-type reduce keys -- [ ] Operator reordering - - [ ] support equivalence: map -> unnest -> reduce might be same as split -> gather -> map -> unnest -> reduce (no need to have a reduce right after map) -- [x] Run tests in CI -- [x] Support retry on validation failure -- [x] Break down split into split + gather (Aug 21 & 22) - - [x] Support this in runner too -- [x] Support more flexible chunking strategies - - [x] Delimiter based 
splitting - - [x] Encode this in API somehow - - [ ] Support this kind of chunking in the optimizer - - [x] Extract headers & levels from documents, and add the level hierarchy to the chunk. -- [x] Support tool use in map operators -- [x] Support prompts exceeding context windows; figure out how to throw out data / prioritize elements -- [x] Support retries in the optimizers -- [x] Operations should not be defined as dictionaries; they should be objects -- [x] Support unnests in the optimizer -- [x] Print out the name of the plan we are synthesizing -- [x] Add gleaning plan to reduce -- [x] Reduce optimizer should get a human to confirm if a drill-down roll-up decomposition makes sense -- [ ] Allow gleaning model to be different from the main op model -- [ ] HITL for prompt selection (generally, a textual app) -- [ ] Fix bug in recursively optimizing reduce in the map optimizer -- [x] Support reduce key of "all" -- [ ] Change reduce_key to group_py -- [ ] Write tests for optimizers -- [ ] Refactor reduce and join optimizers -- [ ] Support prompt prefix/custom instructions so users don't have to put them in every operation -- [ ] Filter optimizer - - [x] Extend map optimizer to support filter - - [ ] Train an embedding classifier for filter -- [ ] Support passing expectations -- [x] Write intermediates to disk -- [ ] Support order by -- [ ] Reduce operations: eagerly process merges to prevent against stragglers/tail latencies in folds? -- [ ] Rewrite API for equijoin input data - -### Experiment todos - -- [ ] Allow for few-shot examples for each operation, to use for the optimizer (especially joins) - -Things to think about - -- Filter chunks before applying the map prompt -- Reduce does not need to be an LLM call: - - it can just be a concatenation of the inputs to the potential LLM call - - it could also be some normal aggregation (e.g., summing up the counts of symptoms, doing a conjunction or disjunction of intermediate outputs for a filter operation) -- If the user specifies a map call in 2 different ways, they should get the same result. E.g., say they want to get a list of all the symptoms referenced in the medical transcript and what caused the symptoms. -- Resolves should support resolves within groups, not necessarily a global resolve -- Synthesize empty resolve in either builder or reduce optimizer, not both -- Figure out how to run validators when data is too large to fit in the prompt (need to randomly sample part of the document) -- In reduce optimizer: if agent suggests drill-down, see if we need to add a map to create the subreduce keys, or the subreduce key already exists -- Try various combine prompts in the reduce optimizer -- Filter optimizer: we should recursively optimize reduces if the reduce isn't good on its own -- Support retry on val failure for operations beyond map/filter -- If reduce input is too big to fit in the prompt, prompt for a map operation -- Pipeline optimization: group maps and reduces together after one pass of optimization diff --git a/vision.md b/vision.md deleted file mode 100644 index 5c3f267f..00000000 --- a/vision.md +++ /dev/null @@ -1,12 +0,0 @@ -# Vision - -Things I'd like for the interface/agents to do: - -- Don't synthesize operators that are too complex. No complex schemas. Intermediate steps should be as unstructured as possible, leaving structured generation to the final output. -- Optimize _one operation_ at a time. This way the user can see the intermediate steps and understand what's happening, and help modify prompts as needed. 
-- Users should be in control of validation prompts. -- When users are looking at intermediates, we should have the ability to run validators on the intermediate prompts. -- We need to store intermediates and have provenance. -- Have an interface to interactively create DocETL pipelines. Start by users defining a high-level task, and optimize one operation at a time. -- Synthesize validate statements for each operation during optimization. -- When generating chunking plans, use an LLM agent to deterimine what chunking plans to synthesize. E.g., it should be able to tell us whether peripheral context is necessary to include in the chunk. diff --git a/website/src/components/FileExplorer.tsx b/website/src/components/FileExplorer.tsx index 54f37d76..3b5fe785 100644 --- a/website/src/components/FileExplorer.tsx +++ b/website/src/components/FileExplorer.tsx @@ -12,6 +12,7 @@ import { Database, AlertTriangle, AlertCircle, + Globe, } from "lucide-react"; import { Button } from "@/components/ui/button"; import { Input } from "@/components/ui/input"; @@ -33,6 +34,7 @@ import { DialogContent, DialogHeader, DialogTitle, + DialogDescription, } from "@/components/ui/dialog"; import { useToast } from "@/hooks/use-toast"; import { DocumentViewer } from "./DocumentViewer"; @@ -149,50 +151,72 @@ async function getAllFiles(entry: FileSystemEntry): Promise { type ConversionMethod = "local" | "azure" | "docetl" | "custom-docling"; -async function validateJsonDataset(file: Blob): Promise { - const text = await file.text(); - let data: unknown; - - try { - data = JSON.parse(text); - } catch { - throw new Error("Invalid JSON format"); - } - - // Check if it's an array - if (!Array.isArray(data)) { - throw new Error( - "Dataset must be an array of objects, like this: [{key: value}, {key: value}]" - ); - } - - // Check if array is not empty - if (data.length === 0) { - throw new Error("Dataset cannot be empty"); - } - - // Check if first item is an object - if (typeof data[0] !== "object" || data[0] === null) { - throw new Error("Dataset must contain objects"); - } +interface RemoteDatasetDialogProps { + isOpen: boolean; + onClose: () => void; + onSubmit: (url: string) => Promise; +} - // Get keys of first object - const firstObjectKeys = Object.keys(data[0]).sort(); +const RemoteDatasetDialog: React.FC = ({ + isOpen, + onClose, + onSubmit, +}) => { + const [url, setUrl] = useState(""); + const [isSubmitting, setIsSubmitting] = useState(false); - // Check if all objects have the same keys - const hasConsistentKeys = data.every((item) => { - if (typeof item !== "object" || item === null) return false; - const currentKeys = Object.keys(item).sort(); - return ( - currentKeys.length === firstObjectKeys.length && - currentKeys.every((key, index) => key === firstObjectKeys[index]) - ); - }); + const handleSubmit = async (e: React.FormEvent) => { + e.preventDefault(); + setIsSubmitting(true); + try { + await onSubmit(url); + onClose(); + } finally { + setIsSubmitting(false); + } + }; - if (!hasConsistentKeys) { - throw new Error("All objects in dataset must have the same keys"); - } -} + return ( + !open && onClose()}> + + + Upload Remote Dataset + + Enter the URL of a publicly accessible JSON or CSV file + + +
+
+ + setUrl(e.target.value)} + required + /> +
+
+ + +
+
+
+
+ ); +}; export const FileExplorer: React.FC = ({ files, @@ -216,12 +240,15 @@ export const FileExplorer: React.FC = ({ const [azureEndpoint, setAzureEndpoint] = useState(""); const [azureKey, setAzureKey] = useState(""); const [customDoclingUrl, setCustomDoclingUrl] = useState(""); - - const { uploadingFiles, uploadDataset } = useDatasetUpload({ - namespace, - onFileUpload, - setCurrentFile, - }); + const [isRemoteDatasetDialogOpen, setIsRemoteDatasetDialogOpen] = + useState(false); + + const { uploadingFiles, uploadLocalDataset, uploadRemoteDataset } = + useDatasetUpload({ + namespace, + onFileUpload, + setCurrentFile, + }); // Group files by folder const groupedFiles = files.reduce((acc: { [key: string]: File[] }, file) => { @@ -255,7 +282,7 @@ export const FileExplorer: React.FC = ({ type: "json", blob: uploadedFile, }; - await uploadDataset(fileToUpload); + await uploadLocalDataset(fileToUpload); }; const handleFileSelection = (file: File) => { @@ -306,11 +333,17 @@ export const FileExplorer: React.FC = ({ }; useEffect(() => { - if (!isUploadDialogOpen && !viewingDocument && !folderToDelete) { + const someDialogOpen = isUploadDialogOpen || isRemoteDatasetDialogOpen; + if (!someDialogOpen && !viewingDocument && !folderToDelete) { // Reset pointer-events after the dialog closes document.body.style.pointerEvents = "auto"; } - }, [isUploadDialogOpen, viewingDocument, folderToDelete]); + }, [ + isUploadDialogOpen, + viewingDocument, + folderToDelete, + isRemoteDatasetDialogOpen, + ]); const handleDialogClose = () => { // Clear the state and close the dialog @@ -524,7 +557,7 @@ export const FileExplorer: React.FC = ({
{ handleFileUpload(e); e.currentTarget.value = ""; @@ -549,14 +582,46 @@ export const FileExplorer: React.FC = ({ {uploadingFiles.size > 0 ? "Uploading dataset..." - : "Upload dataset.json"} + : "Upload Local Dataset"}
- setIsUploadDialogOpen(true)}> - - Upload Files or Folder + setIsRemoteDatasetDialogOpen(true)} + disabled={uploadingFiles.size > 0} + className={`flex items-center w-full cursor-pointer ${ + uploadingFiles.size > 0 ? "opacity-50 cursor-not-allowed" : "" + }`} + > + {uploadingFiles.size > 0 ? ( + + ) : ( + + )} + + {uploadingFiles.size > 0 + ? "Uploading dataset..." + : "Upload Remote Dataset"} + + + setIsUploadDialogOpen(true)} + disabled={uploadingFiles.size > 0} + className={`flex items-center w-full cursor-pointer ${ + uploadingFiles.size > 0 ? "opacity-50 cursor-not-allowed" : "" + }`} + > + {uploadingFiles.size > 0 ? ( + + ) : ( + + )} + + {uploadingFiles.size > 0 + ? "Uploading files..." + : "Upload Files or Folder"} + @@ -1068,6 +1133,14 @@ export const FileExplorer: React.FC = ({ )} + + {isRemoteDatasetDialogOpen && ( + setIsRemoteDatasetDialogOpen(false)} + onSubmit={uploadRemoteDataset} + /> + )} ); }; diff --git a/website/src/components/TutorialsDialog.tsx b/website/src/components/TutorialsDialog.tsx index 2252459d..7c3aa5c8 100644 --- a/website/src/components/TutorialsDialog.tsx +++ b/website/src/components/TutorialsDialog.tsx @@ -284,7 +284,7 @@ export function TutorialsDialog({ files, }: TutorialsDialogProps) { const { toast } = useToast(); - const { uploadDataset } = useDatasetUpload({ + const { uploadLocalDataset } = useDatasetUpload({ namespace, onFileUpload, setCurrentFile, @@ -405,7 +405,7 @@ export function TutorialsDialog({ setUploadedDatasetPath(datasetFileName); // Upload dataset and wait for currentFile to update - await uploadDataset(datasetFile); + await uploadLocalDataset(datasetFile); } catch (error) { console.error("Error loading tutorial:", error); toast({ diff --git a/website/src/hooks/useDatasetUpload.ts b/website/src/hooks/useDatasetUpload.ts index 0e552e9b..25cbf024 100644 --- a/website/src/hooks/useDatasetUpload.ts +++ b/website/src/hooks/useDatasetUpload.ts @@ -2,6 +2,7 @@ import { useState } from "react"; import { useToast } from "@/hooks/use-toast"; import type { File } from "@/app/types"; import { getBackendUrl } from "@/lib/api-config"; +import Papa from "papaparse"; interface UseDatasetUploadOptions { namespace: string; @@ -17,16 +18,7 @@ export function useDatasetUpload({ const { toast } = useToast(); const [uploadingFiles, setUploadingFiles] = useState>(new Set()); - async function validateJsonDataset(file: Blob): Promise { - const text = await file.text(); - let data: unknown; - - try { - data = JSON.parse(text); - } catch { - throw new Error("Invalid JSON format"); - } - + async function validateJsonDataset(data: unknown): Promise { // Check if it's an array if (!Array.isArray(data)) { throw new Error( @@ -62,12 +54,36 @@ export function useDatasetUpload({ } } - const uploadDataset = async (file: File) => { - if (!file.name.toLowerCase().endsWith(".json")) { + const convertCsvToJson = (csvText: string): Promise => { + return new Promise((resolve, reject) => { + Papa.parse(csvText, { + header: true, + dynamicTyping: true, + skipEmptyLines: true, + complete: (results) => { + if (results.errors.length > 0) { + reject( + new Error(`CSV parsing error: ${results.errors[0].message}`) + ); + } else { + resolve(results.data); + } + }, + error: (error) => { + reject(new Error(`CSV parsing error: ${error.message}`)); + }, + }); + }); + }; + + const uploadLocalDataset = async (file: File) => { + const fileExtension = file.name.toLowerCase().split(".").pop(); + + if (!["json", "csv"].includes(fileExtension || "")) { toast({ variant: "destructive", title: 
"Error", - description: "Please upload a JSON file", + description: "Please upload a JSON or CSV file", }); return; } @@ -80,10 +96,30 @@ export function useDatasetUpload({ setUploadingFiles((prev) => new Set(prev).add(file.name)); try { - await validateJsonDataset(file.blob); + let jsonData: unknown; + + if (fileExtension === "csv") { + const csvText = await file.blob.text(); + jsonData = await convertCsvToJson(csvText); + } else { + const text = await file.blob.text(); + try { + jsonData = JSON.parse(text); + } catch { + throw new Error("Invalid JSON format"); + } + } + await validateJsonDataset(jsonData); + + // Convert the JSON data back to a blob for upload + const jsonBlob = new Blob([JSON.stringify(jsonData)], { + type: "application/json", + }); const formData = new FormData(); - formData.append("file", file.blob); + // Always save as .json regardless of input format + const fileName = file.name.replace(/\.(json|csv)$/, ".json"); + formData.append("file", jsonBlob, fileName); formData.append("namespace", namespace); const response = await fetch(`${getBackendUrl()}/fs/upload-file`, { @@ -98,7 +134,7 @@ export function useDatasetUpload({ const data = await response.json(); const newFile = { - name: file.name, + name: fileName, path: data.path, type: "json" as const, parentFolder: "root", @@ -128,8 +164,71 @@ export function useDatasetUpload({ } }; + const uploadRemoteDataset = async (url: string) => { + const fileName = url.split("/").pop() || "dataset.json"; + setUploadingFiles((prev) => new Set(prev).add(fileName)); + + try { + toast({ + title: "Downloading remote dataset...", + description: "This may take a few seconds", + }); + + const formData = new FormData(); + formData.append("url", url); + formData.append("namespace", namespace); + + const response = await fetch(`${getBackendUrl()}/fs/upload-file`, { + method: "POST", + body: formData, + }); + + if (!response.ok) { + // Get the response details + const errorDetails = await response.json(); + throw new Error( + errorDetails.detail || "Failed to fetch remote dataset" + ); + } + + const data = await response.json(); + + const newFile = { + name: fileName.replace(/\.(json|csv)$/, ".json"), + path: data.path, + type: "json" as const, + parentFolder: "root", + }; + + onFileUpload(newFile); + setCurrentFile(newFile); + + toast({ + title: "Success", + description: "Remote dataset downloaded and processed successfully", + }); + } catch (error) { + console.error(error); + toast({ + variant: "destructive", + title: "Error", + description: + error instanceof Error + ? error.message + : "Failed to fetch remote dataset", + }); + } finally { + setUploadingFiles((prev) => { + const next = new Set(prev); + next.delete(fileName); + return next; + }); + } + }; + return { uploadingFiles, - uploadDataset, + uploadLocalDataset, + uploadRemoteDataset, }; }