v1.1.8

Welding-Torch · Welding-Torch · commit b3c3831a2e5a · 2024-02-26T20:21:42.000+05:30
- Accept Command Line argument
- Change name to "excel_anonymizer.py"
- Publish to PYPI
- Other small improvements
diff --git a/.gitignore b/.gitignore
@@ -152,4 +152,7 @@ cython_debug/
 #.idea/
 
 # Anonymized Excel Output
-anonymized_personal_information.xlsx
+personal_information-anonymized.xlsx
+
+# My Upload to PyPI Shortcut
+upload.bat
diff --git a/Anonymize_Excel.py b/Anonymize_Excel.py
diff --git a/README.md b/README.md
@@ -1,11 +1,11 @@
-# Anonymize_Excel
+# Excel Anonymizer
  A Python script that anonymizes an Excel file and synthesizes new data in its place.
 
 ![Excel_Anonymized_Demo](https://github.com/Welding-Torch/Anonymize_Excel/assets/46340124/78b03e03-bad0-4cb0-9b84-46e3197e9344)
 _Convert your sheets with sensitive data into anonymized data._
 
-## What is Anonymize_Excel.py
-Anonymize_Excel.py is a python script that helps to ensure sensitive data is properly managed and governed. It provides fast identification and anonymization for private entities in text such as credit card numbers, names, locations, phone numbers, email address, date/time, with more entities to come.  
+## What is Excel Anonymizer
+Excel Anonymizer is a Python script that helps to ensure sensitive data is properly managed and governed. It provides fast identification and anonymization for private entities in text such as credit card numbers, names, locations, phone numbers, email addresses, and dates/times, with more entities to come.  
 
 ## Use case
 Data anonymization is crucial because it helps protect privacy and maintain confidentiality. If data is not anonymized, sensitive information such as names, addresses, contact numbers, or other identifiers linked to specific individuals could potentially be learned and misused. Hence, by obscuring or removing this personally identifiable information (PII), data can be used freely without compromising individuals’ privacy rights or breaching data protection laws and regulations.  
@@ -15,31 +15,26 @@ Anonymization consists of two steps:
 1. Identification: Identify all data fields that contain personally identifiable information (PII).  
 2. Replacement: Replace all PIIs with pseudo values that do not reveal any personal information about the individual but can be used for reference.  
 
-Anonymize_Excel.py uses Microsoft Presidio together with Faker framework for anonymization purposes.
+Excel Anonymizer uses Microsoft Presidio together with the Faker framework for anonymization purposes.
 
 ## Quickstart
-1. Clone the repository
+1. Install Excel Anonymizer
    ```
-   git clone https://github.com/Welding-Torch/Anonymize_Excel.git
+   pip install excel-anonymizer
    ```
+> Note: spaCy will install a natural language processing package on the first run (587.7 MB).
 
-2. Install the requirements
+2. Download personal_information.xlsx from this repository, and then type
    ```
-   pip install presidio_analyzer
-   pip install presidio_anonymizer
-   python -m spacy download en_core_web_lg
-   ```
-3. Run the demo
-   ```
-   python Anonymize_Excel.py
+   excel-anon personal_information.xlsx
    ```
 
 That's it! 
 
 ## Usage
-To use Anonymize_Excel.py with your Excel file, modify line 8 in the program.
+To use Excel Anonymizer with your own Excel file, pass the file's path as the command-line argument.
 ```
-df = pd.read_excel("your_excel_sheet_here.xlsx")
+excel-anon your_excel_file_here.xlsx
 ```
 
 ## Author
diff --git a/excel_anonymizer.py b/excel_anonymizer.py
@@ -0,0 +1,142 @@
+'''
+Filename: excel_anonymizer.py
+Author: Siddharth Bhatia
+'''
+
+import argparse
+import logging
+import logging.config
+
+import pandas as pd
+from presidio_analyzer import AnalyzerEngine
+from presidio_anonymizer import AnonymizerEngine
+from presidio_anonymizer.entities.engine import OperatorConfig
+from faker import Faker
+
+def main():
+    """Anonymize the Excel file named on the command line.
+
+    Command-line arguments:
+        filename: path of the Excel file, read with ``pandas.read_excel``.
+        -v / --verbose: log each intermediate step at INFO level.
+
+    Every non-empty cell is converted to text, analysed with Presidio, and
+    the detected entities are replaced with Faker-generated values. Empty
+    cells are left empty. The result is written to
+    ``<input path without extension>-anonymized.xlsx`` and that path is
+    printed.
+    """
+    # Function-scope import: only needed here to split off the extension.
+    import os
+
+    # Disable loggers from all imported modules
+    logging.config.dictConfig({
+        'version': 1,
+        'disable_existing_loggers': True,
+    })
+
+    # Initialize parser
+    parser = argparse.ArgumentParser(
+        prog='excel_anonymizer.py',
+        # Adjacent literals instead of a backslash continuation inside the
+        # string, which embedded the source indentation in the description.
+        description='Anonymizes an Excel file and '
+                    'synthesizes new data in its place.',
+        epilog='Made by Siddharth Bhatia')
+
+    # Take file as input
+    parser.add_argument('filename', help="your excel file here")
+    parser.add_argument('-v', '--verbose',
+                        action='store_true')
+
+    # Read arguments from command line
+    args = parser.parse_args()
+    filename = args.filename
+
+    if args.verbose:
+        logging.basicConfig(format="%(message)s", level=logging.INFO)
+        logging.info("Verbose output.")
+
+    def log(string):
+        """Log *string* at INFO level, but only in verbose mode."""
+        if args.verbose:
+            logging.info(string)
+
+    df = pd.read_excel(filename)
+    log(df)
+    log("")
+
+    # Column order of the input sheet, reused when rebuilding the output
+    columns_ordered_list = df.columns.values.tolist()
+    log(f"Columns: {columns_ordered_list}")
+    log("")
+
+    # Map every cell location (row index, column name) to its value
+    cell_data = {}
+    for index, row in df.iterrows():
+        for column in df.columns:
+            cell_data[(index, column)] = row[column]
+
+    # log the list of cell values
+    log(f"Cell Data: {cell_data}")
+    log("")
+    log("###")
+
+    # Presidio code begins here
+    analyzer = AnalyzerEngine()
+    anonymizer = AnonymizerEngine()
+
+    # Faker code begins here.
+    # A single instance, created before the operators that close over it.
+    # (Previously a default-locale Faker was built and then rebound to the
+    # en_IN one after the lambdas were defined; only en_IN was ever used.)
+    fake = Faker(locale="en_IN")
+
+    # Faker Custom Operators
+    fake_operators = {
+        "PERSON": OperatorConfig("custom", {"lambda": lambda x: fake.name()}),
+        "PHONE_NUMBER": OperatorConfig("custom", {"lambda": lambda x: fake.phone_number()}),
+        "LOCATION": OperatorConfig("custom", {"lambda": lambda x: str(fake.country())}),
+        "EMAIL_ADDRESS": OperatorConfig("custom", {"lambda": lambda x: fake.email()}),
+        "DATE_TIME": OperatorConfig("custom", {"lambda": lambda x: str(fake.date_time())}),
+        "CREDIT_CARD": OperatorConfig("custom", {"lambda": lambda x: fake.credit_card_number()}),
+        "US_BANK_NUMBER": OperatorConfig("custom", {"lambda": lambda x: fake.credit_card_number()}),
+        #"DEFAULT": OperatorConfig(operator_name="mask",
+        #                          params={'chars_to_mask': 10,
+        #                                  'masking_char': '*',
+        #                                  'from_end': False}),
+    }
+
+    for location, entity in cell_data.items():
+        log(entity)
+
+        # Leave empty cells empty: str() of a missing value would otherwise
+        # be written to the output as the literal text "nan".
+        if pd.isna(entity):
+            log("")
+            continue
+
+        # Analyze + anonymize it
+        text = str(entity)
+        analyzer_results = analyzer.analyze(text=text, language="en")
+        log(analyzer_results)
+
+        anonymized_results = anonymizer.anonymize(
+            text=text,
+            analyzer_results=analyzer_results,
+            operators=fake_operators,
+        )
+
+        log(f"text: {anonymized_results.text}")
+        log("")
+        # then return it to the dictionary
+        cell_data[location] = anonymized_results.text
+    log("---")
+
+    # Rebuild one list of values per row index, placing each value at the
+    # position its column had in the input sheet.
+    column_position = {name: pos for pos, name in enumerate(columns_ordered_list)}
+    data = {}
+    for (index, column), value in cell_data.items():
+        row_values = data.setdefault(index, [None] * len(columns_ordered_list))
+        row_values[column_position[column]] = value
+    anonymized_df = pd.DataFrame.from_dict(data, columns=columns_ordered_list, orient="index")
+    log(anonymized_df)
+
+    # Drop the extension with splitext. str.rstrip(".xlsx") removes any run
+    # of trailing '.', 'x', 'l', 's' characters, so "sales.xlsx" became "sale".
+    output_stem, _ = os.path.splitext(filename)
+    output_path = f"{output_stem}-anonymized.xlsx"
+    anonymized_df.to_excel(
+        output_path,
+        # Don't save the auto-generated numeric index
+        index=False
+    )
+
+    print(f"Output generated: {output_path}")
+
+if __name__ == "__main__":  # run main() only when executed as a script, not when imported
+    main()
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,55 @@
+[build-system]
+requires = ["setuptools>=61.2.0", "wheel", "setuptools_scm[toml]>=3.4.3"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "excel_anonymizer"
+authors = [{name = "Siddharth Bhatia"}]
+description = "Anonymizes an Excel file and synthesizes new data in its place"
+readme = "README.md"
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Environment :: Console",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Education",
+    "Intended Audience :: End Users/Desktop",
+    "Intended Audience :: Information Technology",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Operating System :: Unix",
+    "Operating System :: POSIX :: Linux",
+    "Operating System :: MacOS :: MacOS X",
+    "Operating System :: Microsoft :: Windows",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Topic :: Office/Business",
+    "Topic :: Utilities",
+    "Topic :: Office/Business :: Financial :: Spreadsheet",
+]
+dependencies = [
+  "presidio_analyzer",
+  "presidio_anonymizer",
+  "pandas",
+  "pyarrow",
+  "faker",
+  "openpyxl",
+  "en_core_web_lg",
+]
+
+#dynamic = ["version"]
+version = "1.1.7"
+
+[project.scripts]
+excel-anonymizer = "excel_anonymizer:main"
+excel-anon = "excel_anonymizer:main"
+
+[tool.setuptools]
+py-modules = ["excel_anonymizer"]
+include-package-data = false
+
+[tool.setuptools_scm]