Skip to content

Commit 6eeb95e

Browse files
authored
feat(API): Ingest plain text (#1417)
* Add ingest/text route to ingest plain text
* Add new ingest text test and adapt ingest/file ones
* Include new API in docs
* Remove duplicated logic
1 parent: 059f358; commit: 6eeb95e

File tree

6 files changed: 198 additions and 17 deletions.

fern/openapi/openapi.json

Lines changed: 124 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@
137137
"Ingestion"
138138
],
139139
"summary": "Ingest",
140-
"description": "Ingests and processes a file, storing its chunks to be used as context.\n\nThe context obtained from files is later used in\n`/chat/completions`, `/completions`, and `/chunks` APIs.\n\nMost common document\nformats are supported, but you may be prompted to install an extra dependency to\nmanage a specific file type.\n\nA file can generate different Documents (for example a PDF generates one Document\nper page). All Documents IDs are returned in the response, together with the\nextracted Metadata (which is later used to improve context retrieval). Those IDs\ncan be used to filter the context used to create responses in\n`/chat/completions`, `/completions`, and `/chunks` APIs.",
140+
"description": "Ingests and processes a file.\n\nDeprecated. Use ingest/file instead.",
141141
"operationId": "ingest_v1_ingest_post",
142142
"requestBody": {
143143
"content": {
@@ -149,6 +149,91 @@
149149
},
150150
"required": true
151151
},
152+
"responses": {
153+
"200": {
154+
"description": "Successful Response",
155+
"content": {
156+
"application/json": {
157+
"schema": {
158+
"$ref": "#/components/schemas/IngestResponse"
159+
}
160+
}
161+
}
162+
},
163+
"422": {
164+
"description": "Validation Error",
165+
"content": {
166+
"application/json": {
167+
"schema": {
168+
"$ref": "#/components/schemas/HTTPValidationError"
169+
}
170+
}
171+
}
172+
}
173+
},
174+
"deprecated": true
175+
}
176+
},
177+
"/v1/ingest/file": {
178+
"post": {
179+
"tags": [
180+
"Ingestion"
181+
],
182+
"summary": "Ingest File",
183+
"description": "Ingests and processes a file, storing its chunks to be used as context.\n\nThe context obtained from files is later used in\n`/chat/completions`, `/completions`, and `/chunks` APIs.\n\nMost common document\nformats are supported, but you may be prompted to install an extra dependency to\nmanage a specific file type.\n\nA file can generate different Documents (for example a PDF generates one Document\nper page). All Documents IDs are returned in the response, together with the\nextracted Metadata (which is later used to improve context retrieval). Those IDs\ncan be used to filter the context used to create responses in\n`/chat/completions`, `/completions`, and `/chunks` APIs.",
184+
"operationId": "ingest_file_v1_ingest_file_post",
185+
"requestBody": {
186+
"content": {
187+
"multipart/form-data": {
188+
"schema": {
189+
"$ref": "#/components/schemas/Body_ingest_file_v1_ingest_file_post"
190+
}
191+
}
192+
},
193+
"required": true
194+
},
195+
"responses": {
196+
"200": {
197+
"description": "Successful Response",
198+
"content": {
199+
"application/json": {
200+
"schema": {
201+
"$ref": "#/components/schemas/IngestResponse"
202+
}
203+
}
204+
}
205+
},
206+
"422": {
207+
"description": "Validation Error",
208+
"content": {
209+
"application/json": {
210+
"schema": {
211+
"$ref": "#/components/schemas/HTTPValidationError"
212+
}
213+
}
214+
}
215+
}
216+
}
217+
}
218+
},
219+
"/v1/ingest/text": {
220+
"post": {
221+
"tags": [
222+
"Ingestion"
223+
],
224+
"summary": "Ingest Text",
225+
"description": "Ingests and processes a text, storing its chunks to be used as context.\n\nThe context obtained from files is later used in\n`/chat/completions`, `/completions`, and `/chunks` APIs.\n\nA Document will be generated with the given text. The Document\nID is returned in the response, together with the\nextracted Metadata (which is later used to improve context retrieval). That ID\ncan be used to filter the context used to create responses in\n`/chat/completions`, `/completions`, and `/chunks` APIs.",
226+
"operationId": "ingest_text_v1_ingest_text_post",
227+
"requestBody": {
228+
"content": {
229+
"application/json": {
230+
"schema": {
231+
"$ref": "#/components/schemas/IngestTextBody"
232+
}
233+
}
234+
},
235+
"required": true
236+
},
152237
"responses": {
153238
"200": {
154239
"description": "Successful Response",
@@ -303,6 +388,20 @@
303388
},
304389
"components": {
305390
"schemas": {
391+
"Body_ingest_file_v1_ingest_file_post": {
392+
"properties": {
393+
"file": {
394+
"type": "string",
395+
"format": "binary",
396+
"title": "File"
397+
}
398+
},
399+
"type": "object",
400+
"required": [
401+
"file"
402+
],
403+
"title": "Body_ingest_file_v1_ingest_file_post"
404+
},
306405
"Body_ingest_v1_ingest_post": {
307406
"properties": {
308407
"file": {
@@ -735,6 +834,30 @@
735834
],
736835
"title": "IngestResponse"
737836
},
837+
"IngestTextBody": {
838+
"properties": {
839+
"file_name": {
840+
"type": "string",
841+
"title": "File Name",
842+
"examples": [
843+
"Avatar: The Last Airbender"
844+
]
845+
},
846+
"text": {
847+
"type": "string",
848+
"title": "Text",
849+
"examples": [
850+
"Avatar is set in an Asian and Arctic-inspired world in which some people can telekinetically manipulate one of the four elements\u2014water, earth, fire or air\u2014through practices known as 'bending', inspired by Chinese martial arts."
851+
]
852+
}
853+
},
854+
"type": "object",
855+
"required": [
856+
"file_name",
857+
"text"
858+
],
859+
"title": "IngestTextBody"
860+
},
738861
"IngestedDoc": {
739862
"properties": {
740863
"object": {

private_gpt/server/ingest/ingest_router.py

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from typing import Literal
22

33
from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile
4-
from pydantic import BaseModel
4+
from pydantic import BaseModel, Field
55

66
from private_gpt.server.ingest.ingest_service import IngestService
77
from private_gpt.server.ingest.model import IngestedDoc
@@ -10,14 +10,35 @@
1010
ingest_router = APIRouter(prefix="/v1", dependencies=[Depends(authenticated)])
1111

1212

13+
class IngestTextBody(BaseModel):
14+
file_name: str = Field(examples=["Avatar: The Last Airbender"])
15+
text: str = Field(
16+
examples=[
17+
"Avatar is set in an Asian and Arctic-inspired world in which some "
18+
"people can telekinetically manipulate one of the four elements—water, "
19+
"earth, fire or air—through practices known as 'bending', inspired by "
20+
"Chinese martial arts."
21+
]
22+
)
23+
24+
1325
class IngestResponse(BaseModel):
1426
object: Literal["list"]
1527
model: Literal["private-gpt"]
1628
data: list[IngestedDoc]
1729

1830

19-
@ingest_router.post("/ingest", tags=["Ingestion"])
31+
@ingest_router.post("/ingest", tags=["Ingestion"], deprecated=True)
2032
def ingest(request: Request, file: UploadFile) -> IngestResponse:
33+
"""Ingests and processes a file.
34+
35+
Deprecated. Use ingest/file instead.
36+
"""
37+
return ingest_file(request, file)
38+
39+
40+
@ingest_router.post("/ingest/file", tags=["Ingestion"])
41+
def ingest_file(request: Request, file: UploadFile) -> IngestResponse:
2142
"""Ingests and processes a file, storing its chunks to be used as context.
2243
2344
The context obtained from files is later used in
@@ -40,6 +61,26 @@ def ingest(request: Request, file: UploadFile) -> IngestResponse:
4061
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
4162

4263

64+
@ingest_router.post("/ingest/text", tags=["Ingestion"])
65+
def ingest_text(request: Request, body: IngestTextBody) -> IngestResponse:
66+
"""Ingests and processes a text, storing its chunks to be used as context.
67+
68+
The context obtained from files is later used in
69+
`/chat/completions`, `/completions`, and `/chunks` APIs.
70+
71+
A Document will be generated with the given text. The Document
72+
ID is returned in the response, together with the
73+
extracted Metadata (which is later used to improve context retrieval). That ID
74+
can be used to filter the context used to create responses in
75+
`/chat/completions`, `/completions`, and `/chunks` APIs.
76+
"""
77+
service = request.state.injector.get(IngestService)
78+
if len(body.file_name) == 0:
79+
raise HTTPException(400, "No file name provided")
80+
ingested_documents = service.ingest_text(body.file_name, body.text)
81+
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
82+
83+
4384
@ingest_router.get("/ingest/list", tags=["Ingestion"])
4485
def list_ingested(request: Request) -> IngestResponse:
4586
"""Lists already ingested Documents including their Document ID and metadata.

private_gpt/server/ingest/ingest_service.py

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
22
import tempfile
33
from pathlib import Path
4-
from typing import BinaryIO
4+
from typing import AnyStr, BinaryIO
55

66
from injector import inject, singleton
77
from llama_index import (
@@ -53,16 +53,7 @@ def __init__(
5353
self.storage_context, self.ingest_service_context, settings=settings()
5454
)
5555

56-
def ingest(self, file_name: str, file_data: Path) -> list[IngestedDoc]:
57-
logger.info("Ingesting file_name=%s", file_name)
58-
documents = self.ingest_component.ingest(file_name, file_data)
59-
return [IngestedDoc.from_document(document) for document in documents]
60-
61-
def ingest_bin_data(
62-
self, file_name: str, raw_file_data: BinaryIO
63-
) -> list[IngestedDoc]:
64-
logger.debug("Ingesting binary data with file_name=%s", file_name)
65-
file_data = raw_file_data.read()
56+
def _ingest_data(self, file_name: str, file_data: AnyStr) -> list[IngestedDoc]:
6657
logger.debug("Got file data of size=%s to ingest", len(file_data))
6758
# llama-index mainly supports reading from files, so
6859
# we have to create a tmp file to read for it to work
@@ -74,11 +65,27 @@ def ingest_bin_data(
7465
path_to_tmp.write_bytes(file_data)
7566
else:
7667
path_to_tmp.write_text(str(file_data))
77-
return self.ingest(file_name, path_to_tmp)
68+
return self.ingest_file(file_name, path_to_tmp)
7869
finally:
7970
tmp.close()
8071
path_to_tmp.unlink()
8172

73+
def ingest_file(self, file_name: str, file_data: Path) -> list[IngestedDoc]:
74+
logger.info("Ingesting file_name=%s", file_name)
75+
documents = self.ingest_component.ingest(file_name, file_data)
76+
return [IngestedDoc.from_document(document) for document in documents]
77+
78+
def ingest_text(self, file_name: str, text: str) -> list[IngestedDoc]:
79+
logger.debug("Ingesting text data with file_name=%s", file_name)
80+
return self._ingest_data(file_name, text)
81+
82+
def ingest_bin_data(
83+
self, file_name: str, raw_file_data: BinaryIO
84+
) -> list[IngestedDoc]:
85+
logger.debug("Ingesting binary data with file_name=%s", file_name)
86+
file_data = raw_file_data.read()
87+
return self._ingest_data(file_name, file_data)
88+
8289
def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[IngestedDoc]:
8390
logger.info("Ingesting file_names=%s", [f[0] for f in files])
8491
documents = self.ingest_component.bulk_ingest(files)

scripts/ingest_folder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def _do_ingest_one(self, changed_path: Path) -> None:
4848
try:
4949
if changed_path.exists():
5050
logger.info(f"Started ingesting file={changed_path}")
51-
self.ingest_service.ingest(changed_path.name, changed_path)
51+
self.ingest_service.ingest_file(changed_path.name, changed_path)
5252
logger.info(f"Completed ingesting file={changed_path}")
5353
except Exception:
5454
logger.exception(

tests/fixtures/ingest_helper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def __init__(self, test_client: TestClient):
1313
def ingest_file(self, path: Path) -> IngestResponse:
1414
files = {"file": (path.name, path.open("rb"))}
1515

16-
response = self.test_client.post("/v1/ingest", files=files)
16+
response = self.test_client.post("/v1/ingest/file", files=files)
1717
assert response.status_code == 200
1818
ingest_result = IngestResponse.model_validate(response.json())
1919
return ingest_result

tests/server/ingest/test_ingest_routes.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from fastapi.testclient import TestClient
55

6+
from private_gpt.server.ingest.ingest_router import IngestResponse
67
from tests.fixtures.ingest_helper import IngestHelper
78

89

@@ -34,3 +35,12 @@ def test_ingest_list_returns_something_after_ingestion(
3435
assert (
3536
count_ingest_after == count_ingest_before + 1
3637
), "The temp doc should be returned"
38+
39+
40+
def test_ingest_plain_text(test_client: TestClient) -> None:
41+
response = test_client.post(
42+
"/v1/ingest/text", json={"file_name": "file_name", "text": "text"}
43+
)
44+
assert response.status_code == 200
45+
ingest_result = IngestResponse.model_validate(response.json())
46+
assert len(ingest_result.data) == 1

0 commit comments

Comments
 (0)