Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ingest plain text #1417

Merged
merged 4 commits into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 124 additions & 1 deletion fern/openapi/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@
"Ingestion"
],
"summary": "Ingest",
"description": "Ingests and processes a file, storing its chunks to be used as context.\n\nThe context obtained from files is later used in\n`/chat/completions`, `/completions`, and `/chunks` APIs.\n\nMost common document\nformats are supported, but you may be prompted to install an extra dependency to\nmanage a specific file type.\n\nA file can generate different Documents (for example a PDF generates one Document\nper page). All Documents IDs are returned in the response, together with the\nextracted Metadata (which is later used to improve context retrieval). Those IDs\ncan be used to filter the context used to create responses in\n`/chat/completions`, `/completions`, and `/chunks` APIs.",
"description": "Ingests and processes a file.\n\nDeprecated. Use ingest/file instead.",
"operationId": "ingest_v1_ingest_post",
"requestBody": {
"content": {
Expand All @@ -149,6 +149,91 @@
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/IngestResponse"
}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
},
"deprecated": true
}
},
"/v1/ingest/file": {
"post": {
"tags": [
"Ingestion"
],
"summary": "Ingest File",
"description": "Ingests and processes a file, storing its chunks to be used as context.\n\nThe context obtained from files is later used in\n`/chat/completions`, `/completions`, and `/chunks` APIs.\n\nMost common document\nformats are supported, but you may be prompted to install an extra dependency to\nmanage a specific file type.\n\nA file can generate different Documents (for example a PDF generates one Document\nper page). All Documents IDs are returned in the response, together with the\nextracted Metadata (which is later used to improve context retrieval). Those IDs\ncan be used to filter the context used to create responses in\n`/chat/completions`, `/completions`, and `/chunks` APIs.",
"operationId": "ingest_file_v1_ingest_file_post",
"requestBody": {
"content": {
"multipart/form-data": {
"schema": {
"$ref": "#/components/schemas/Body_ingest_file_v1_ingest_file_post"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/IngestResponse"
}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/ingest/text": {
"post": {
"tags": [
"Ingestion"
],
"summary": "Ingest Text",
"description": "Ingests and processes a text, storing its chunks to be used as context.\n\nThe context obtained from files is later used in\n`/chat/completions`, `/completions`, and `/chunks` APIs.\n\nA Document will be generated with the given text. The Document\nID is returned in the response, together with the\nextracted Metadata (which is later used to improve context retrieval). That ID\ncan be used to filter the context used to create responses in\n`/chat/completions`, `/completions`, and `/chunks` APIs.",
"operationId": "ingest_text_v1_ingest_text_post",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/IngestTextBody"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
Expand Down Expand Up @@ -303,6 +388,20 @@
},
"components": {
"schemas": {
"Body_ingest_file_v1_ingest_file_post": {
"properties": {
"file": {
"type": "string",
"format": "binary",
"title": "File"
}
},
"type": "object",
"required": [
"file"
],
"title": "Body_ingest_file_v1_ingest_file_post"
},
"Body_ingest_v1_ingest_post": {
"properties": {
"file": {
Expand Down Expand Up @@ -735,6 +834,30 @@
],
"title": "IngestResponse"
},
"IngestTextBody": {
"properties": {
"file_name": {
"type": "string",
"title": "File Name",
"examples": [
"Avatar: The Last Airbender"
]
},
"text": {
"type": "string",
"title": "Text",
"examples": [
"Avatar is set in an Asian and Arctic-inspired world in which some people can telekinetically manipulate one of the four elements\u2014water, earth, fire or air\u2014through practices known as 'bending', inspired by Chinese martial arts."
]
}
},
"type": "object",
"required": [
"file_name",
"text"
],
"title": "IngestTextBody"
},
"IngestedDoc": {
"properties": {
"object": {
Expand Down
45 changes: 43 additions & 2 deletions private_gpt/server/ingest/ingest_router.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Literal

from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile
from pydantic import BaseModel
from pydantic import BaseModel, Field

from private_gpt.server.ingest.ingest_service import IngestService
from private_gpt.server.ingest.model import IngestedDoc
Expand All @@ -10,14 +10,35 @@
# Router holding the ingestion endpoints declared below: every route is
# mounted under the "/v1" prefix and declares `authenticated` as a dependency.
ingest_router = APIRouter(prefix="/v1", dependencies=[Depends(authenticated)])


class IngestTextBody(BaseModel):
    # JSON request body of the plain-text ingestion endpoint.
    # NOTE: documented with `#` comments on purpose -- a class docstring on a
    # pydantic model is emitted as the schema description, which would change
    # the generated OpenAPI document.

    # Name under which the text is handed to the ingest service.
    file_name: str = Field(examples=["Avatar: The Last Airbender"])
    # Raw text to ingest.
    text: str = Field(
        examples=[
            "Avatar is set in an Asian and Arctic-inspired world in which "
            "some people can telekinetically manipulate one of the four "
            "elements—water, earth, fire or air—through practices known as "
            "'bending', inspired by Chinese martial arts."
        ]
    )


class IngestResponse(BaseModel):
    # Response envelope returned by the ingestion endpoints of this router.
    # Field order and literal types define the wire schema; keep them as-is.

    # Constant marker: the payload is always a list envelope.
    object: Literal["list"]
    # Constant marker: always "private-gpt".
    model: Literal["private-gpt"]
    # The ingested documents being reported back to the caller.
    data: list[IngestedDoc]


# Legacy upload endpoint, kept so existing clients keep working.
# It is registered exactly once and flagged `deprecated=True`; stacking a
# second, non-deprecated `@ingest_router.post("/ingest", ...)` decorator would
# register the same path twice.
# The docstring below is published verbatim as the endpoint description, so
# it is intentionally left short and unchanged.
@ingest_router.post("/ingest", tags=["Ingestion"], deprecated=True)
def ingest(request: Request, file: UploadFile) -> IngestResponse:
    """Ingests and processes a file.

    Deprecated. Use ingest/file instead.
    """
    # Delegate to the replacement handler so both routes behave identically.
    return ingest_file(request, file)


@ingest_router.post("/ingest/file", tags=["Ingestion"])
def ingest_file(request: Request, file: UploadFile) -> IngestResponse:
"""Ingests and processes a file, storing its chunks to be used as context.

The context obtained from files is later used in
Expand All @@ -40,6 +61,26 @@ def ingest(request: Request, file: UploadFile) -> IngestResponse:
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)


# Handler for POST /v1/ingest/text.
#   request: used only to reach the per-request injector on `request.state`.
#   body:    validated `IngestTextBody` carrying `file_name` and `text`.
# Returns an `IngestResponse` wrapping whatever `IngestService.ingest_text`
# produced. Raises `HTTPException(400)` when `file_name` is empty.
# The docstring is published verbatim as the endpoint description, so it is
# kept unchanged; implementation notes live in these comments instead.
@ingest_router.post("/ingest/text", tags=["Ingestion"])
def ingest_text(request: Request, body: IngestTextBody) -> IngestResponse:
    """Ingests and processes a text, storing its chunks to be used as context.

    The context obtained from files is later used in
    `/chat/completions`, `/completions`, and `/chunks` APIs.

    A Document will be generated with the given text. The Document
    ID is returned in the response, together with the
    extracted Metadata (which is later used to improve context retrieval). That ID
    can be used to filter the context used to create responses in
    `/chat/completions`, `/completions`, and `/chunks` APIs.
    """
    # Fail fast: reject an empty name before resolving the ingest service.
    if not body.file_name:
        raise HTTPException(400, "No file name provided")
    service = request.state.injector.get(IngestService)
    ingested_documents = service.ingest_text(body.file_name, body.text)
    return IngestResponse(object="list", model="private-gpt", data=ingested_documents)


@ingest_router.get("/ingest/list", tags=["Ingestion"])
def list_ingested(request: Request) -> IngestResponse:
"""Lists already ingested Documents including their Document ID and metadata.
Expand Down
31 changes: 19 additions & 12 deletions private_gpt/server/ingest/ingest_service.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
import tempfile
from pathlib import Path
from typing import BinaryIO
from typing import AnyStr, BinaryIO

from injector import inject, singleton
from llama_index import (
Expand Down Expand Up @@ -53,16 +53,7 @@ def __init__(
self.storage_context, self.ingest_service_context, settings=settings()
)

def ingest(self, file_name: str, file_data: Path) -> list[IngestedDoc]:
    """Ingest the file at ``file_data`` and describe the resulting documents.

    Logs ``file_name``, forwards both arguments to
    ``self.ingest_component.ingest`` and converts every document it returns
    with ``IngestedDoc.from_document``, preserving their order.
    """
    logger.info("Ingesting file_name=%s", file_name)
    produced = self.ingest_component.ingest(file_name, file_data)
    return list(map(IngestedDoc.from_document, produced))

def ingest_bin_data(
self, file_name: str, raw_file_data: BinaryIO
) -> list[IngestedDoc]:
logger.debug("Ingesting binary data with file_name=%s", file_name)
file_data = raw_file_data.read()
def _ingest_data(self, file_name: str, file_data: AnyStr) -> list[IngestedDoc]:
logger.debug("Got file data of size=%s to ingest", len(file_data))
# llama-index mainly supports reading from files, so
# we have to create a tmp file to read for it to work
Expand All @@ -74,11 +65,27 @@ def ingest_bin_data(
path_to_tmp.write_bytes(file_data)
else:
path_to_tmp.write_text(str(file_data))
return self.ingest(file_name, path_to_tmp)
return self.ingest_file(file_name, path_to_tmp)
finally:
tmp.close()
path_to_tmp.unlink()

def ingest_file(self, file_name: str, file_data: Path) -> list[IngestedDoc]:
    """Ingest the file located at ``file_data`` under the name ``file_name``.

    The work is done by ``self.ingest_component.ingest``; each document it
    yields is wrapped via ``IngestedDoc.from_document`` and the wrappers are
    returned in the same order.
    """
    logger.info("Ingesting file_name=%s", file_name)
    ingested_docs: list[IngestedDoc] = []
    for llama_document in self.ingest_component.ingest(file_name, file_data):
        ingested_docs.append(IngestedDoc.from_document(llama_document))
    return ingested_docs

def ingest_text(self, file_name: str, text: str) -> list[IngestedDoc]:
    """Ingest an in-memory string as if it were a file named ``file_name``.

    Logs the name at debug level, then hands the text to
    ``self._ingest_data`` and returns its result unchanged.
    """
    logger.debug("Ingesting text data with file_name=%s", file_name)
    ingested_docs = self._ingest_data(file_name, text)
    return ingested_docs

def ingest_bin_data(
    self, file_name: str, raw_file_data: BinaryIO
) -> list[IngestedDoc]:
    """Ingest the full content of an open binary stream.

    Logs the name at debug level, reads ``raw_file_data`` to the end in one
    call and passes the resulting payload to ``self._ingest_data``.
    """
    logger.debug("Ingesting binary data with file_name=%s", file_name)
    # The log call stays first so the read happens after it, as before.
    return self._ingest_data(file_name, raw_file_data.read())

def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[IngestedDoc]:
logger.info("Ingesting file_names=%s", [f[0] for f in files])
documents = self.ingest_component.bulk_ingest(files)
Expand Down
2 changes: 1 addition & 1 deletion scripts/ingest_folder.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def _do_ingest_one(self, changed_path: Path) -> None:
try:
if changed_path.exists():
logger.info(f"Started ingesting file={changed_path}")
self.ingest_service.ingest(changed_path.name, changed_path)
self.ingest_service.ingest_file(changed_path.name, changed_path)
logger.info(f"Completed ingesting file={changed_path}")
except Exception:
logger.exception(
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/ingest_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def __init__(self, test_client: TestClient):
def ingest_file(self, path: Path) -> IngestResponse:
    """Upload ``path`` to ``/v1/ingest/file`` and return the parsed response.

    The file is sent as the multipart field ``file`` under its own name.
    The helper asserts that the server answered with HTTP 200 and validates
    the JSON body against ``IngestResponse`` before returning it.
    """
    # Open inside a context manager so the handle is closed once the request
    # has been sent; a bare `path.open("rb")` leaked one descriptor per call.
    with path.open("rb") as file_handle:
        files = {"file": (path.name, file_handle)}
        # Single upload to the current endpoint; the deprecated "/v1/ingest"
        # route is not exercised here, so the file is ingested only once.
        response = self.test_client.post("/v1/ingest/file", files=files)
    assert response.status_code == 200
    return IngestResponse.model_validate(response.json())
Expand Down
10 changes: 10 additions & 0 deletions tests/server/ingest/test_ingest_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from fastapi.testclient import TestClient

from private_gpt.server.ingest.ingest_router import IngestResponse
from tests.fixtures.ingest_helper import IngestHelper


Expand Down Expand Up @@ -34,3 +35,12 @@ def test_ingest_list_returns_something_after_ingestion(
assert (
count_ingest_after == count_ingest_before + 1
), "The temp doc should be returned"


def test_ingest_plain_text(test_client: TestClient) -> None:
    """Posting plain text to ``/v1/ingest/text`` yields exactly one document."""
    payload = {"file_name": "file_name", "text": "text"}
    response = test_client.post("/v1/ingest/text", json=payload)
    assert response.status_code == 200
    # Validate the envelope first so a malformed body fails with a clear error.
    ingest_result = IngestResponse.model_validate(response.json())
    assert len(ingest_result.data) == 1
Loading