Skip to content

Commit b178b51

Browse files
authored
feat(bulk-ingest): Add --ignored Flag to Exclude Specific Files and Directories During Ingestion (#1432)
1 parent 24fae66 commit b178b51

File tree

3 files changed

+41
-7
lines changed

3 files changed

+41
-7
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
.venv
2+
.env
3+
venv
24

35
settings-me.yaml
46

Makefile

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,3 +56,20 @@ wipe:
5656

5757
setup:
5858
poetry run python scripts/setup
59+
60+
list:
61+
@echo "Available commands:"
62+
@echo " test : Run tests using pytest"
63+
@echo " test-coverage : Run tests with coverage report"
64+
@echo " black : Check code format with black"
65+
@echo " ruff : Check code with ruff"
66+
@echo " format : Format code with black and ruff"
67+
@echo " mypy : Run mypy for type checking"
68+
@echo " check : Run format and mypy commands"
69+
@echo " run : Run the application"
70+
@echo " dev-windows : Run the application in development mode on Windows"
71+
@echo " dev : Run the application in development mode"
72+
@echo " api-docs : Generate API documentation"
73+
@echo " ingest : Ingest data using specified script"
74+
@echo " wipe : Wipe data using specified script"
75+
@echo " setup : Setup the application"

scripts/ingest_folder.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,20 +20,20 @@ def __init__(self, ingest_service: IngestService) -> None:
2020

2121
self._files_under_root_folder: list[Path] = list()
2222

23-
def _find_all_files_in_folder(self, root_path: Path) -> None:
23+
def _find_all_files_in_folder(self, root_path: Path, ignored: list[str]) -> None:
2424
"""Search all files under the root folder recursively.
2525
Count them at the same time
2626
"""
2727
for file_path in root_path.iterdir():
28-
if file_path.is_file():
28+
if file_path.is_file() and file_path.name not in ignored:
2929
self.total_documents += 1
3030
self._files_under_root_folder.append(file_path)
31-
elif file_path.is_dir():
32-
self._find_all_files_in_folder(file_path)
31+
elif file_path.is_dir() and file_path.name not in ignored:
32+
self._find_all_files_in_folder(file_path, ignored)
3333

34-
def ingest_folder(self, folder_path: Path) -> None:
34+
def ingest_folder(self, folder_path: Path, ignored: list[str]) -> None:
3535
# Count total documents before ingestion
36-
self._find_all_files_in_folder(folder_path)
36+
self._find_all_files_in_folder(folder_path, ignored)
3737
self._ingest_all(self._files_under_root_folder)
3838

3939
def _ingest_all(self, files_to_ingest: list[Path]) -> None:
@@ -64,12 +64,19 @@ def _do_ingest_one(self, changed_path: Path) -> None:
6464
action=argparse.BooleanOptionalAction,
6565
default=False,
6666
)
67+
parser.add_argument(
68+
"--ignored",
69+
nargs="*",
70+
help="List of files/directories to ignore",
71+
default=[],
72+
)
6773
parser.add_argument(
6874
"--log-file",
6975
help="Optional path to a log file. If provided, logs will be written to this file.",
7076
type=str,
7177
default=None,
7278
)
79+
7380
args = parser.parse_args()
7481

7582
# Set up logging to a file if a path is provided
@@ -91,9 +98,17 @@ def _do_ingest_one(self, changed_path: Path) -> None:
9198

9299
ingest_service = global_injector.get(IngestService)
93100
worker = LocalIngestWorker(ingest_service)
94-
worker.ingest_folder(root_path)
101+
worker.ingest_folder(root_path, args.ignored)
102+
103+
if args.ignored:
104+
logger.info(f"Skipping following files and directories: {args.ignored}")
95105

96106
if args.watch:
97107
logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...")
108+
directories_to_watch = [
109+
dir
110+
for dir in root_path.iterdir()
111+
if dir.is_dir() and dir.name not in args.ignored
112+
]
98113
watcher = IngestWatcher(args.folder, worker.ingest_on_watch)
99114
watcher.start()

0 commit comments

Comments
 (0)