Skip to content

Commit e54a8fe

Browse files
authored
fix: prevent to ingest local files (by default) (#2010)
* feat: prevent to local ingestion (by default) and add white-list * docs: add local ingestion warning * docs: add missing comment * fix: update exception error * fix: black
1 parent 1020cd5 commit e54a8fe

File tree

5 files changed

+133
-3
lines changed

5 files changed

+133
-3
lines changed

fern/docs/pages/manual/ingestion.mdx

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,14 @@ The ingestion of documents can be done in different ways:
88

99
## Bulk Local Ingestion
1010

11+
You will need to enable `data.local_ingestion.enabled` in your settings file to use this feature. Additionally,
12+
it is probably a good idea to set `data.local_ingestion.allow_ingest_from` to specify which folders are allowed to be ingested.
13+
14+
<Callout intent = "warning">
15+
Be careful when enabling this feature in a production environment: it can be a security risk, because it allows users to
16+
ingest any local file that the application has permission to read.
17+
</Callout>
18+
1119
When you are running PrivateGPT in a fully local setup, you can ingest a complete folder for convenience (containing
1220
pdf, text files, etc.)
1321
and optionally watch changes on it with the command:

private_gpt/settings/settings.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,27 @@ class AuthSettings(BaseModel):
5959
)
6060

6161

62+
class IngestionSettings(BaseModel):
    """Settings that gate ingestion performed by non-server methods.

    These options control how data is brought into the system outside of the
    server, which is convenient for local development, for testing, or for
    loading a whole folder in bulk.

    Security note: this mechanism is not secure by itself. Use it only in a
    controlled environment (appropriate filesystem permissions, etc.).
    """

    # Disabled unless explicitly switched on (``default=False``).
    enabled: bool = Field(
        default=False,
        description="Flag indicating if local ingestion is enabled or not.",
    )
    # Folder allow-list; empty unless configured.
    allow_ingest_from: list[str] = Field(
        default=[],
        description="A list of folders that should be permitted to make ingest requests.",
    )
81+
82+
6283
class ServerSettings(BaseModel):
6384
env_name: str = Field(
6485
description="Name of the environment (prod, staging, local...)"
@@ -74,6 +95,10 @@ class ServerSettings(BaseModel):
7495

7596

7697
class DataSettings(BaseModel):
98+
local_ingestion: IngestionSettings = Field(
99+
description="Ingestion configuration",
100+
default_factory=lambda: IngestionSettings(allow_ingest_from=["*"]),
101+
)
77102
local_data_folder: str = Field(
78103
description="Path to local storage."
79104
"It will be treated as an absolute path if it starts with /"

scripts/ingest_folder.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,38 @@
77
from private_gpt.di import global_injector
88
from private_gpt.server.ingest.ingest_service import IngestService
99
from private_gpt.server.ingest.ingest_watcher import IngestWatcher
10+
from private_gpt.settings.settings import Settings
1011

1112
logger = logging.getLogger(__name__)
1213

1314

1415
class LocalIngestWorker:
15-
def __init__(self, ingest_service: IngestService) -> None:
16+
def __init__(self, ingest_service: IngestService, setting: Settings) -> None:
    """Keep the ingest service, reset the counters and cache the ingestion policy.

    Args:
        ingest_service: Service object stored on the worker.
        setting: Application settings; only ``data.local_ingestion`` is read.
    """
    # Read the local-ingestion section once and copy the two values we need.
    local_policy = setting.data.local_ingestion
    self.is_local_ingestion_enabled = local_policy.enabled
    self.allowed_local_folders = local_policy.allow_ingest_from

    # Bookkeeping starts from scratch for every worker.
    self._files_under_root_folder: list[Path] = []
    self.total_documents = self.current_document_count = 0

    self.ingest_service = ingest_service
26+
27+
def _validate_folder(self, folder_path: Path) -> None:
    """Ensure ``folder_path`` may be ingested under the local-ingestion policy.

    Args:
        folder_path: File or folder that is about to be ingested.

    Raises:
        ValueError: If local ingestion is disabled, or if ``folder_path`` is
            not located under at least one of the allowed folders.
    """
    if not self.is_local_ingestion_enabled:
        raise ValueError(
            "Local ingestion is disabled. "
            "You can enable it in settings `data.local_ingestion.enabled`"
        )

    # Allow all folders if wildcard is present
    if "*" in self.allowed_local_folders:
        return

    # Resolve both sides so that ".." segments, symlinks and relative paths
    # cannot be used to step outside the allow-list.
    candidate = folder_path.resolve()

    # A single matching entry is enough. An empty allow-list matches nothing,
    # so it denies every path instead of silently allowing all of them.
    if not any(
        candidate.is_relative_to(Path(allowed_folder).resolve())
        for allowed_folder in self.allowed_local_folders
    ):
        raise ValueError(f"Folder {folder_path} is not allowed for ingestion")
41+
2342
def _find_all_files_in_folder(self, root_path: Path, ignored: list[str]) -> None:
2443
"""Search all files under the root folder recursively.
2544
@@ -28,6 +47,7 @@ def _find_all_files_in_folder(self, root_path: Path, ignored: list[str]) -> None
2847
for file_path in root_path.iterdir():
2948
if file_path.is_file() and file_path.name not in ignored:
3049
self.total_documents += 1
50+
self._validate_folder(file_path)
3151
self._files_under_root_folder.append(file_path)
3252
elif file_path.is_dir() and file_path.name not in ignored:
3353
self._find_all_files_in_folder(file_path, ignored)
@@ -92,13 +112,13 @@ def _do_ingest_one(self, changed_path: Path) -> None:
92112
logger.addHandler(file_handler)
93113

94114
if __name__ == "__main__":
95-
96115
root_path = Path(args.folder)
97116
if not root_path.exists():
98117
raise ValueError(f"Path {args.folder} does not exist")
99118

100119
ingest_service = global_injector.get(IngestService)
101-
worker = LocalIngestWorker(ingest_service)
120+
settings = global_injector.get(Settings)
121+
worker = LocalIngestWorker(ingest_service, settings)
102122
worker.ingest_folder(root_path, args.ignored)
103123

104124
if args.ignored:

settings.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ server:
1717
secret: "Basic c2VjcmV0OmtleQ=="
1818

1919
data:
20+
local_ingestion:
21+
enabled: ${LOCAL_INGESTION_ENABLED:false}
22+
allow_ingest_from: ["*"]
2023
local_data_folder: local_data/private_gpt
2124

2225
ui:
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import os
2+
import subprocess
3+
from pathlib import Path
4+
5+
import pytest
6+
from fastapi.testclient import TestClient
7+
8+
9+
@pytest.fixture()
def file_path() -> str:
    """Provide the file name used by the ingestion tests."""
    test_file_name = "test.txt"
    return test_file_name
12+
13+
14+
def create_test_file(file_path: str) -> None:
    """Create (or overwrite) ``file_path`` with the fixed content ``"test"``.

    Args:
        file_path: Location of the file to write.
    """
    # An explicit encoding keeps the written bytes independent of the locale.
    with open(file_path, "w", encoding="utf-8") as f:
        f.write("test")
17+
18+
19+
def clear_log_file(log_file_path: str) -> None:
    """Delete the file at ``log_file_path``; do nothing when it is absent.

    Args:
        log_file_path: Location of the file to remove.
    """
    # A single unlink call avoids the window between an existence check and
    # the removal in which another process could delete the file first.
    Path(log_file_path).unlink(missing_ok=True)
22+
23+
24+
def read_log_file(log_file_path: str) -> str:
    """Return the whole text content of the file at ``log_file_path``."""
    return Path(log_file_path).read_text()
27+
28+
29+
def init_structure(folder: str, file_path: str) -> None:
    """Prepare ``folder`` so that it contains a freshly written test file.

    Args:
        folder: Directory to create (including parents) when missing.
        file_path: Name of the test file to create inside ``folder``.
    """
    # Removes a file called ``file_path`` relative to the working directory,
    # if there is one, before the folder is populated.
    clear_log_file(file_path)
    os.makedirs(folder, exist_ok=True)
    # f-strings interpolate with braces only; a "$" in front of the braces
    # would end up as a literal character in the file name.
    create_test_file(f"{folder}/{file_path}")
33+
34+
35+
def test_ingest_one_file_in_allowed_folder(
    file_path: str, test_client: TestClient
) -> None:
    """Run the ingest script with local ingestion enabled and expect documents.

    The script is executed in a subprocess with ``LOCAL_INGESTION_ENABLED`` set
    to ``"True"``; afterwards the ingest listing must not be empty.
    """
    import sys  # local import: only needed to locate the running interpreter

    allowed_folder = "local_data/tests/allowed_folder"
    init_structure(allowed_folder, file_path)

    test_env = os.environ.copy()
    test_env["PGPT_PROFILES"] = "test"
    test_env["LOCAL_INGESTION_ENABLED"] = "True"

    result = subprocess.run(
        # sys.executable runs the script with the same interpreter (and
        # virtual environment) as the tests, whatever "python" is on PATH.
        [sys.executable, "scripts/ingest_folder.py", allowed_folder],
        capture_output=True,
        text=True,
        env=test_env,
    )

    assert result.returncode == 0, f"Script failed with error: {result.stderr}"
    response_after = test_client.get("/v1/ingest/list")

    count_ingest_after = len(response_after.json()["data"])
    assert count_ingest_after > 0, "No documents were ingested"
57+
58+
59+
def test_ingest_disabled(file_path: str) -> None:
    """Run the ingest script with local ingestion disabled and expect a failure.

    The script is executed in a subprocess with ``LOCAL_INGESTION_ENABLED`` set
    to ``"False"``; it must exit with a non-zero return code.
    """
    import sys  # local import: only needed to locate the running interpreter

    allowed_folder = "local_data/tests/allowed_folder"
    init_structure(allowed_folder, file_path)

    test_env = os.environ.copy()
    test_env["PGPT_PROFILES"] = "test"
    test_env["LOCAL_INGESTION_ENABLED"] = "False"

    result = subprocess.run(
        # sys.executable runs the script with the same interpreter (and
        # virtual environment) as the tests, whatever "python" is on PATH.
        [sys.executable, "scripts/ingest_folder.py", allowed_folder],
        capture_output=True,
        text=True,
        env=test_env,
    )

    # This assertion fires when the script succeeded, so the message reports
    # the unexpected success rather than a script error.
    assert result.returncode != 0, (
        "Script unexpectedly succeeded although local ingestion is disabled: "
        f"{result.stdout}"
    )

0 commit comments

Comments
 (0)