Skip to content

Commit 10da194

Browse files
Merge pull request #49 from seanpedrick-case/dev_new
Revamped duplicate page/subdocument removal, CDK code, updated documentation, read-only file system compatibility.
2 parents 95ca426 + 3946be6 commit 10da194

31 files changed

+5916
-14068
lines changed

.dockerignore

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,28 @@
44
*.jpg
55
*.png
66
*.ipynb
7+
*.pyc
78
examples/*
89
processing/*
9-
input/*
10-
output/*
1110
tools/__pycache__/*
1211
old_code/*
1312
tesseract/*
1413
poppler/*
1514
build/*
1615
dist/*
1716
build_deps/*
18-
logs/*
19-
config/*
2017
user_guide/*
21-
cdk/*
2218
cdk/config/*
23-
web/*
19+
tld/*
20+
cdk/config/*
21+
cdk/cdk.out/*
22+
cdk/archive/*
23+
cdk.json
24+
cdk.context.json
25+
.quarto/*
26+
logs/
27+
output/
28+
input/
29+
feedback/
30+
config/
31+
usage/

.gitignore

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
*.jpg
55
*.png
66
*.ipynb
7+
*.pyc
78
examples/*
89
processing/*
910
input/*
@@ -19,6 +20,14 @@ logs/*
1920
config/*
2021
doc_redaction_amplify_app/*
2122
user_guide/*
22-
cdk/*
2323
cdk/config/*
24-
web/*
24+
cdk/cdk.out/*
25+
cdk/archive/*
26+
tld/*
27+
tmp/*
28+
cdk.out/*
29+
cdk.json
30+
cdk.context.json
31+
.quarto/*
32+
/.quarto/
33+
/_site/

Dockerfile

Lines changed: 75 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
# Stage 1: Build dependencies and download models
22
FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm AS builder
33

4-
# Install system dependencies. Need to specify -y for poppler to get it to install
4+
# Install system dependencies
55
RUN apt-get update \
66
&& apt-get install -y \
77
g++ \
88
make \
99
cmake \
1010
unzip \
11-
libcurl4-openssl-dev \
11+
libcurl4-openssl-dev \
1212
git \
1313
&& apt-get clean \
1414
&& rm -rf /var/lib/apt/lists/*
@@ -17,28 +17,20 @@ WORKDIR /src
1717

1818
COPY requirements.txt .
1919

20-
RUN pip install --no-cache-dir --target=/install -r requirements.txt
21-
22-
RUN rm requirements.txt
20+
RUN pip install --no-cache-dir --target=/install -r requirements.txt && rm requirements.txt
2321

24-
# Add lambda_entrypoint.py to the container
22+
# Add lambda entrypoint and script
2523
COPY lambda_entrypoint.py .
26-
2724
COPY entrypoint.sh .
2825

2926
# Stage 2: Final runtime image
3027
FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm
3128

32-
# Define a build argument with a default value
29+
# Set build-time and runtime environment variable
3330
ARG APP_MODE=gradio
34-
35-
# Echo the APP_MODE during the build to confirm its value
36-
RUN echo "APP_MODE is set to: ${APP_MODE}"
37-
38-
# Set APP_MODE as an environment variable for runtime
3931
ENV APP_MODE=${APP_MODE}
4032

41-
# Install system dependencies
33+
# Install runtime dependencies
4234
RUN apt-get update \
4335
&& apt-get install -y \
4436
tesseract-ocr \
@@ -48,30 +40,85 @@ RUN apt-get update \
4840
&& apt-get clean \
4941
&& rm -rf /var/lib/apt/lists/*
5042

51-
# Set up a new user named "user" with user ID 1000
43+
# Create non-root user
5244
RUN useradd -m -u 1000 user
45+
ENV APP_HOME=/home/user
5346

54-
# Create required directories
55-
RUN mkdir -p /home/user/app/{output,input,tld,logs,usage,feedback,config} \
56-
&& chown -R user:user /home/user/app
47+
# Set env variables for Gradio & other apps
48+
ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
49+
TLDEXTRACT_CACHE=/tmp/tld/ \
50+
MPLCONFIGDIR=/tmp/matplotlib_cache/ \
51+
GRADIO_OUTPUT_FOLDER=$APP_HOME/app/output/ \
52+
GRADIO_INPUT_FOLDER=$APP_HOME/app/input/ \
53+
FEEDBACK_LOGS_FOLDER=$APP_HOME/app/feedback/ \
54+
ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
55+
USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
56+
CONFIG_FOLDER=$APP_HOME/app/config/ \
57+
XDG_CACHE_HOME=/tmp/xdg_cache/user_1000
58+
59+
# Create the base application directory and set its ownership
60+
RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
61+
62+
# Create required sub-folders within the app directory and set their permissions
63+
# This ensures these specific directories are owned by 'user'
64+
RUN mkdir -p \
65+
${APP_HOME}/app/output \
66+
${APP_HOME}/app/input \
67+
${APP_HOME}/app/logs \
68+
${APP_HOME}/app/usage \
69+
${APP_HOME}/app/feedback \
70+
${APP_HOME}/app/config \
71+
&& chown user:user \
72+
${APP_HOME}/app/output \
73+
${APP_HOME}/app/input \
74+
${APP_HOME}/app/logs \
75+
${APP_HOME}/app/usage \
76+
${APP_HOME}/app/feedback \
77+
${APP_HOME}/app/config \
78+
&& chmod 755 \
79+
${APP_HOME}/app/output \
80+
${APP_HOME}/app/input \
81+
${APP_HOME}/app/logs \
82+
${APP_HOME}/app/usage \
83+
${APP_HOME}/app/feedback \
84+
${APP_HOME}/app/config
85+
86+
# Now handle the /tmp and /var/tmp directories and their subdirectories
87+
RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
88+
&& chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
89+
&& chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
90+
&& chmod 700 ${XDG_CACHE_HOME}
5791

5892
# Copy installed packages from builder stage
5993
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
6094

61-
# Download NLTK data packages - now no longer necessary
62-
# RUN python -m nltk.downloader --quiet punkt stopwords punkt_tab
95+
# Copy app code and entrypoint with correct ownership
96+
COPY --chown=user . $APP_HOME/app
6397

64-
# Entrypoint helps to switch between Gradio and Lambda mode
98+
# Copy and chmod entrypoint
6599
COPY entrypoint.sh /entrypoint.sh
66-
67100
RUN chmod +x /entrypoint.sh
68101

69-
# Switch to the "user" user
102+
# Switch to user
70103
USER user
71104

72-
ENV APP_HOME=/home/user
105+
# Declare working directory
106+
WORKDIR $APP_HOME/app
73107

74-
# Set environmental variables
108+
# Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
109+
VOLUME ["/tmp/matplotlib_cache"]
110+
VOLUME ["/tmp/gradio_tmp"]
111+
VOLUME ["/tmp/tld"]
112+
VOLUME ["/home/user/app/output"]
113+
VOLUME ["/home/user/app/input"]
114+
VOLUME ["/home/user/app/logs"]
115+
VOLUME ["/home/user/app/usage"]
116+
VOLUME ["/home/user/app/feedback"]
117+
VOLUME ["/home/user/app/config"]
118+
VOLUME ["/tmp"]
119+
VOLUME ["/var/tmp"]
120+
121+
# Set runtime environment
75122
ENV PATH=$APP_HOME/.local/bin:$PATH \
76123
PYTHONPATH=$APP_HOME/app \
77124
PYTHONUNBUFFERED=1 \
@@ -80,20 +127,8 @@ ENV PATH=$APP_HOME/.local/bin:$PATH \
80127
GRADIO_NUM_PORTS=1 \
81128
GRADIO_SERVER_NAME=0.0.0.0 \
82129
GRADIO_SERVER_PORT=7860 \
83-
GRADIO_ANALYTICS_ENABLED=False \
84-
TLDEXTRACT_CACHE=$APP_HOME/app/tld/.tld_set_snapshot \
85-
SYSTEM=spaces
86-
87-
# Set the working directory to the user's home directory
88-
WORKDIR $APP_HOME/app
89-
90-
# Copy the app code to the container
91-
COPY --chown=user . $APP_HOME/app
92-
93-
# Ensure permissions are really user:user again after copying
94-
RUN chown -R user:user $APP_HOME/app && chmod -R u+rwX $APP_HOME/app
130+
GRADIO_ANALYTICS_ENABLED=False
95131

96-
ENTRYPOINT [ "/entrypoint.sh" ]
132+
ENTRYPOINT ["/entrypoint.sh"]
97133

98-
# Default command for Lambda mode
99-
CMD [ "lambda_entrypoint.lambda_handler" ]
134+
CMD ["lambda_entrypoint.lambda_handler"]

README.md

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,16 @@ colorFrom: blue
55
colorTo: yellow
66
sdk: docker
77
app_file: app.py
8-
pinned: false
8+
pinned: true
99
license: agpl-3.0
1010
---
1111
# Document redaction
1212

13-
version: 0.6.8
13+
version: 0.7.0
1414

1515
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
1616

17-
To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
17+
To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works quite well for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
1818

1919
After redaction, review suggested redactions on the 'Review redactions' tab. The original pdf can be uploaded here alongside a '...redaction_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or custom terms to always include/ exclude from redaction.
2020

@@ -181,6 +181,8 @@ If the table is empty, you can add a new entry, you can add a new row by clickin
181181

182182
![Manually modify allow or deny list filled](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/manually_modify_filled.PNG)
183183

184+
**Note:** As of version 0.7.0 you can now apply your whole page redaction list directly to the document file currently under review by clicking the 'Apply whole page redaction list to document currently under review' button that appears here.
185+
184186
### Redacting additional types of personal information
185187

186188
You may want to redact additional types of information beyond the defaults, or you may not be interested in default suggested entity types. There are dates in the example complaint letter. Say we wanted to redact those dates also?
@@ -390,21 +392,49 @@ You can find this option at the bottom of the 'Redaction Settings' tab. Upload m
390392

391393
The files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/).
392394

393-
Some redaction tasks involve removing duplicate pages of text that may exist across multiple documents. This feature calculates the similarity of text in all pages of input PDFs, calculates a similarity score, and then flags pages above a certain similarity score (90%) for removal by creating a 'whole page' redaction list file for each input PDF.
395+
Some redaction tasks involve removing duplicate pages of text that may exist across multiple documents. This feature helps you find and remove duplicate content that may exist in single or multiple documents. It can identify everything from single identical pages to multi-page sections (subdocuments). The process involves three main steps: configuring the analysis, reviewing the results in the interactive interface, and then using the generated files to perform the redactions.
396+
397+
![Example duplicate page inputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_input_interface_new.PNG)
398+
399+
**Step 1: Upload and Configure the Analysis**
400+
First, navigate to the "Identify duplicate pages" tab. Upload all the ocr_output.csv files you wish to compare into the file area. These files are generated every time you run a redaction task and contain the text for each page of a document.
401+
402+
For our example, you can upload the four 'ocr_output.csv' files provided in the example folder into the file area. Click 'Identify duplicate pages' and you will see a number of files returned. In case you want to see the original PDFs, they are available [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/input_pdfs/).
403+
404+
The default options will search for matching subdocuments of any length. Before running the analysis, you can configure these matching parameters to tell the tool what you're looking for:
405+
406+
![Duplicate matching parameters](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_matching_parameters.PNG)
394407

395-
![Example duplicate page outputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_output_interface.PNG)
408+
*Matching Parameters*
409+
- **Similarity Threshold:** A score from 0 to 1. Pages or sequences of pages with a calculated text similarity above this value will be considered a match. The default of 0.9 (90%) is a good starting point for finding near-identical pages.
410+
- **Min Word Count:** Pages with fewer words than this value will be completely ignored during the comparison. This is extremely useful for filtering out blank pages, title pages, or boilerplate pages that might otherwise create noise in the results. The default is 10.
411+
- **Choosing a Matching Strategy:** You have three main options to find duplicate content.
412+
- *'Subdocument' matching (default):* Use this to find the longest possible sequence of matching pages. The tool will find an initial match and then automatically expand it forward page-by-page until the consecutive match breaks. This is the best method for identifying complete copied chapters or sections of unknown length. It is enabled by default: the "Enable 'subdocument' matching" box is ticked when the app starts. While enabled, this setting overrides the options described below.
413+
- *Minimum length subdocument matching:* Use this to find sequences of consecutively matching pages with a minimum page length. For example, setting the slider to 3 will only return sections that are at least 3 pages long. To enable it, untick the "Enable 'subdocument' matching" box and set the "Minimum consecutive pages" slider to a value greater than 1.
414+
- *Single Page Matching:* Use this to find all individual page pairs that are similar to each other. Leave the "Enable 'subdocument' matching" box unchecked and keep the "Minimum consecutive pages" slider at 1.
396415

397-
The similarity calculation is based on using the 'ocr_outputs.csv' file that is output every time that you perform a redaction task. From the file folder, upload the four 'ocr_output.csv' files provided in the example folder into the file area. Click 'Identify duplicate pages' and you will see a number of files returned. In case you want to see the original PDFs, they are available [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/input_pdfs/).
416+
Once your parameters are set, click the "Identify duplicate pages/subdocuments" button.
398417

399-
![Identify duplicate pages interface](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_input_interface.PNG)
418+
**Step 2: Review Results in the Interface**
419+
After the analysis is complete, the results will be displayed directly in the interface.
400420

401-
First, there is a 'combined_ocr_result...' file that just merges together all the text from the input files. 'page_similarity_results.csv' shows a breakdown of the pages from each file that are most similar to each other above the threshold (90% similarity). You can compare the text in the two columns 'Page_1_Text' and 'Page_2_Text'.
421+
*Analysis Summary:* A table will appear showing a summary of all the matches found. The columns will change depending on the matching strategy you chose. For subdocument matches, it will show the start and end pages of the matched sequence.
422+
423+
*Interactive Preview:* This is the most important part of the review process. Click on any row in the summary table. The full text of the matching page(s) will appear side-by-side in the "Full Text Preview" section below, allowing you to instantly verify the accuracy of the match.
424+
425+
![Duplicate review interface](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_output_review_overview.PNG)
426+
427+
**Step 3: Download and Use the Output Files**
428+
The analysis also generates a set of downloadable files for your records and for performing redactions.
429+
430+
431+
- page_similarity_results.csv: This is a detailed report of the analysis you just ran. It shows a breakdown of the pages from each file that are most similar to each other above the similarity threshold. You can compare the text in the two columns 'Page_1_Text' and 'Page_2_Text'. For single-page matches, it will list each pair of matching pages. For subdocument matches, it will list the start and end pages of each matched sequence, along with the total length of the match.
402432

403433
![Page similarity file example](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/page_similarity_example.PNG)
404434

405-
The remaining output files are suffixed with '_whole_page.csv'. These are the same files that can be used to redact whole pages as described in the ['Full page redaction list example' section](#full-page-redaction-list-example). For each PDF involved in the duplicate detection process, you can upload the relevant '_whole_page.csv' file into the relevant area, then do a new redaction task for the PDF file without any entity types selected. This way, only the suggested whole pages will be suggested for redaction and nothing else.
435+
- [Original_Filename]_pages_to_redact.csv: For each input document that was found to contain duplicate content, a separate redaction list is created. This is a simple, one-column CSV file containing a list of all page numbers that should be removed. To use these files, you can either upload the original document (i.e. the PDF) on the 'Review redactions' tab, and then click on the 'Apply relevant duplicate page output to document currently under review' button. You should see the whole pages suggested for redaction on the 'Review redactions' tab. Alternatively, you can reupload the file into the whole page redaction section as described in the ['Full page redaction list example' section](#full-page-redaction-list-example).
406436

407-
![Example duplicate page redaction list](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/output_file_2_whole_page_outputs.PNG)
437+
![Example duplicate page redaction list](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_output_interface_new.PNG)
408438

409439
If you want to combine the results from this redaction process with previous redaction tasks for the same PDF, you could merge review file outputs following the steps described in [Merging existing redaction review files](#merging-existing-redaction-review-files) above.
410440

@@ -505,6 +535,8 @@ Again, a lot can potentially go wrong with AWS solutions that are insecure, so b
505535

506536
## Modifying existing redaction review files
507537

538+
*Note:* As of version 0.7.0 you can now modify redaction review files directly in the app on the 'Review redactions' tab. Open the accordion 'View and edit review data' under the file input area. You can edit review file data cells here - press Enter to apply changes. You should see the effect on the current page if you click the 'Save changes on current page to file' button to the right.
539+
508540
You can find the folder containing the files discussed in this section [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/).
509541

510542
As well as serving as inputs to the document redaction app's review function, the 'review_file.csv' output can be modified outside of the app, and also merged with others from multiple redaction attempts on the same file. This gives you the flexibility to change redaction details outside of the app.

_quarto.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
project:
2+
type: website
3+
output-dir: docs # Common for GitHub Pages
4+
render:
5+
- "*.qmd"
6+
7+
website:
8+
title: "Document Redaction App"
9+
page-navigation: true # Often enabled for floating TOC to highlight current section
10+
back-to-top-navigation: true
11+
search: true
12+
navbar:
13+
left:
14+
- href: index.qmd
15+
text: Home
16+
- href: src/user_guide.qmd
17+
text: User guide
18+
- href: src/faq.qmd
19+
text: User FAQ
20+
- href: src/installation_guide.qmd
21+
text: App installation guide (with CDK)
22+
- href: src/app_settings.qmd
23+
text: App settings management guide
24+
25+
format:
26+
html:
27+
theme: cosmo
28+
css: styles.css

0 commit comments

Comments
 (0)