Skip to content

Added changes for Email Address and Credit Card Number #580

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pebblo/app/pebblo-ui/src/constants/keywordMapping.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ export const KEYWORD_MAPPING = {
"us-passport-number": "US Passport number",
"us-drivers-license": "US Drivers License",
"credit-card-number": "Credit card number",
"email-address": "Email Address",
"us-bank-account-number": "US Bank Account Number",
"iban-code": "IBAN code",
"us-itin": "US ITIN",
Expand Down
106 changes: 106 additions & 0 deletions pebblo/entity_classifier/custom_analyzer/cerdit_card_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from typing import List, Tuple

from presidio_analyzer import Pattern
from presidio_analyzer.predefined_recognizers.credit_card_recognizer import (
CreditCardRecognizer,
)


class ExtendedCreditCardRecognizer(CreditCardRecognizer):
    """
    Credit card recognizer covering additional card brands.

    Builds on Presidio's ``CreditCardRecognizer`` by supplying one regex
    pattern per card brand together with a broader list of context words.
    Candidate matches are confirmed with the Luhn checksum in
    ``validate_result``.
    """

    # One regex per supported card brand; every pattern carries a base
    # score of 0.5.
    ADDITIONAL_PATTERNS = [
        Pattern("Amex Card", r"\b3[47][0-9]{13}\b", 0.5),
        Pattern("BCGlobal", r"\b(6541|6556)[0-9]{12,15}\b", 0.5),
        Pattern("Carte Blanche Card", r"\b389[0-9]{11}\b", 0.5),
        Pattern("Diners Club", r"\b3(?:0[0-5]|[68][0-9])[0-9]{11}\b", 0.5),
        Pattern(
            "Discover",
            # All alternatives live inside a single group so that the
            # trailing \b applies to each of them, not only to the last one.
            # NOTE(review): the `[2-8][0-9]{3}` branch yields 17 digits while
            # the sibling branches yield 16 - confirm this is intended.
            r"\b(?:65[4-9][0-9]{13}|64[4-9][0-9]{13}|6011[0-9]{12}"
            r"|622(?:12[6-9]|1[3-9][0-9]|[2-8][0-9]{3}|9[01][0-9]|92[0-5])[0-9]{10})\b",
            0.5,
        ),
        Pattern("Insta Payment", r"\b63[7-9][0-9]{13}\b", 0.5),
        Pattern("JCB Card", r"\b(?:2131|1800|35\d{3})\d{11}\b", 0.5),
        Pattern("KoreanLocalCard", r"\b9[0-9]{15}\b", 0.5),
        Pattern("Laser Card", r"\b(6304|6706|6709|6771)[0-9]{12,15}\b", 0.5),
        Pattern(
            "Maestro Card", r"\b(5018|5020|5038|6304|6759|6761|6763)[0-9]{8,15}\b", 0.5
        ),
        Pattern(
            "Mastercard",
            r"\b5[1-5][0-9]{14}\b"
            r"|\b2(22[1-9][0-9]{12}|2[3-9][0-9]{13}|[3-6][0-9]{14}|7[0-1][0-9]{13}|720[0-9]{12})\b",
            0.5,
        ),
        Pattern("Solo Card", r"\b(6334|6767)[0-9]{12,15}\b", 0.5),
        Pattern(
            "Switch Card",
            r"\b(4903|4905|4911|4936|6333|6759)[0-9]{12,15}\b"
            r"|\b564182[0-9]{10,13}\b|\b633110[0-9]{10,13}\b",
            0.5,
        ),
        Pattern("Union Pay", r"\b62[0-9]{14,17}\b", 0.5),
        Pattern("Visa Card", r"\b4[0-9]{12}(?:[0-9]{3})?\b", 0.5),
        # Pattern(
        #     "All Credit Cards (weak)",
        #     r"\b((4\d{3})|(5[0-5]\d{2})|(6\d{3})|(1\d{3})|(3\d{3}))[- ]?(\d{3,4})[- ]?(\d{3,4})[- ]?(\d{3,5})\b",  # noqa: E501
        #     0.3,
        # ),
    ]

    # Words near a match that suggest the number is a payment card.
    # "card" and "debit" are separate entries: without the comma Python
    # concatenates the adjacent literals into the single word "carddebit".
    CONTEXT = [
        "credit",
        "credit_card",
        "card",
        "debit",
        "Visa",
        "Mastercard",
        "Amex",
        "Discover",
        "JCB",
        "Diners Club",
        "Carte Blanche",
        "Insta Payment",
        "Maestro",
        "UnionPay",
        "BCGlobal",
        "KoreanLocalCard",
        "cc ",
        "diners",
        "instapayment",
    ]

    def __init__(self):
        """Configure the base recognizer with the extended patterns and context."""
        super().__init__(
            supported_entity="CREDIT_CARD",  # The entity being identified
            patterns=self.ADDITIONAL_PATTERNS,  # The brand-specific patterns
            context=self.CONTEXT,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Check a matched candidate against the Luhn checksum.

        :param pattern_text: Text matched by one of the patterns; may contain
            the separators listed in ``self.replacement_pairs``.
        :return: True if the sanitized digits pass the Luhn checksum, False
            otherwise (including when nothing but separators was matched or
            non-digit characters remain after sanitizing).
        """
        sanitized_value = self.__sanitize_value(pattern_text, self.replacement_pairs)
        # An empty string would trivially "pass" the checksum (0 % 10 == 0)
        # and a non-digit character would make int() raise; reject both.
        if not sanitized_value.isdecimal():
            return False
        return self.__luhn_checksum(sanitized_value)

    @staticmethod
    def __luhn_checksum(sanitized_value: str) -> bool:
        """
        Return True if the digit string satisfies the Luhn checksum.

        :param sanitized_value: String consisting of decimal digits only.
        """
        digits = [int(char) for char in sanitized_value]
        # Counting from the right: digits in odd positions are added as is,
        # digits in even positions are doubled and their digits are summed.
        checksum = sum(digits[-1::-2])
        checksum += sum(sum(divmod(digit * 2, 10)) for digit in digits[-2::-2])
        return checksum % 10 == 0

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """
        Apply every (search, replacement) pair to the text, in order.

        :param text: Raw matched text.
        :param replacement_pairs: Pairs of substring to find and its replacement.
        :return: The text after all replacements have been applied.
        """
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text
7 changes: 7 additions & 0 deletions pebblo/entity_classifier/entity_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer
from presidio_anonymizer import AnonymizerEngine

from pebblo.entity_classifier.custom_analyzer.cerdit_card_analyzer import (
ExtendedCreditCardRecognizer,
)
from pebblo.entity_classifier.custom_analyzer.private_key_analyzer import (
PrivateKeyRecognizer,
)
Expand Down Expand Up @@ -47,6 +50,10 @@ def custom_analyze(self):
# Add the private key recognizer to the Presidio Analyzer
self.analyzer.registry.add_recognizer(pk_recognizer)

cc_recognizer = ExtendedCreditCardRecognizer()
# Add the credit card recognizer to the Presidio Analyzer
self.analyzer.registry.add_recognizer(cc_recognizer)

def analyze_response(
self, input_text: str, anonymize_all_entities: bool = True
) -> list:
Expand Down
5 changes: 5 additions & 0 deletions pebblo/entity_classifier/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ class Entities(Enum):
US_PASSPORT = "us-passport-number"
US_DRIVER_LICENSE = "us-drivers-license"

# contact information
EMAIL_ADDRESS = "email-address"
# network
IP_ADDRESS = "ip-address"

Expand Down Expand Up @@ -55,13 +57,16 @@ class PIIGroups(Enum):
Financial = "pii-financial"
Secrets = "secrets_and_tokens"
Network = "pii-network"
Contact = "pii-contact-information"


entity_group_conf_mapping = {
# Identification
Entities.US_SSN.value: (0.8, PIIGroups.Identification.value),
Entities.US_PASSPORT.value: (0.4, PIIGroups.Identification.value),
Entities.US_DRIVER_LICENSE.value: (0.4, PIIGroups.Identification.value),
# Contact
Entities.EMAIL_ADDRESS.value: (0.8, PIIGroups.Contact.value),
# Financial
Entities.US_ITIN.value: (0.8, PIIGroups.Financial.value),
Entities.CREDIT_CARD.value: (0.8, PIIGroups.Financial.value),
Expand Down
1 change: 1 addition & 0 deletions pebblo/reports/enums/keyword_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
"us-passport-number": "US Passport number",
"us-drivers-license": "US Drivers License",
"credit-card-number": "Credit card number",
"email-address": "Email Address",
"us-bank-account-number": "US Bank Account Number",
"iban-code": "IBAN code",
"us-itin": "US ITIN",
Expand Down
96 changes: 96 additions & 0 deletions tests/entity_classifier/test_credit_card_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import unittest

from presidio_analyzer import AnalyzerEngine

from pebblo.entity_classifier.custom_analyzer.cerdit_card_analyzer import (
ExtendedCreditCardRecognizer,
)


class TestExtendedCreditCardRecognizer(unittest.TestCase):
    """Exercise ExtendedCreditCardRecognizer through a Presidio AnalyzerEngine."""

    def setUp(self):
        # Fresh engine per test, with the extended recognizer registered on it
        self.analyzer = AnalyzerEngine()
        self.recognizer = ExtendedCreditCardRecognizer()
        self.analyzer.registry.add_recognizer(self.recognizer)

    def _find_cards(self, text):
        """Run the analyzer on the text, restricted to CREDIT_CARD entities."""
        return self.analyzer.analyze(text, entities=["CREDIT_CARD"], language="en")

    def _assert_single_card(self, text, start, end):
        """Assert that exactly one CREDIT_CARD is found at text[start:end]."""
        found = self._find_cards(text)
        self.assertEqual(len(found), 1)
        hit = found[0]
        self.assertEqual(hit.entity_type, "CREDIT_CARD")
        self.assertEqual(hit.start, start)
        self.assertEqual(hit.end, end)

    def test_visa_card(self):
        # Plain Visa number without separators
        self._assert_single_card("My card number is 4111111111111111.", 18, 34)

    def test_visa_card_with_spaces(self):
        # Visa number grouped with spaces
        self._assert_single_card("My card number is 4111 1111 1111 1111.", 18, 37)

    def test_mastercard_with_hyphens(self):
        # Mastercard number grouped with hyphens
        self._assert_single_card("My card number is 5500-0000-0000-0004.", 18, 37)

    def test_amex_card(self):
        # American Express number
        self._assert_single_card("My Amex card number is 378282246310005.", 23, 38)

    def test_diners_club_card(self):
        # Diners Club number
        self._assert_single_card("My Diners Club card is 30569309025904.", 23, 37)

    def test_jcb_card(self):
        # JCB number
        self._assert_single_card("My JCB card number is 3530111333300000.", 22, 38)

    def test_invalid_card(self):
        # A digit run that is not a valid card number must yield no result
        found = self._find_cards("This is an invalid card number 1234567890123456.")
        self.assertEqual(len(found), 0)

    def test_credit_card_with_context(self):
        # Card number preceded by context words
        self._assert_single_card(
            "The credit card number 4111111111111111 is valid.", 23, 39
        )

    def test_validate_result_with_luhn_checksum(self):
        # A number with a correct Luhn check digit validates
        self.assertTrue(self.recognizer.validate_result("4111111111111111"))

    def test_validate_result_invalid_luhn_checksum(self):
        # Changing the last digit breaks the Luhn checksum
        self.assertFalse(self.recognizer.validate_result("4111111111111112"))
99 changes: 99 additions & 0 deletions tests/entity_classifier/test_email.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import unittest

from presidio_analyzer import AnalyzerEngine


class TestEmailRecognizer(unittest.TestCase):
    """
    Check EMAIL_ADDRESS detection with a default Presidio AnalyzerEngine.

    The sample addresses are chosen so that their lengths agree with the
    start/end offsets asserted in each test.
    """

    def setUp(self):
        # Default engine; no custom recognizer is registered for e-mail
        self.analyzer = AnalyzerEngine()

    def _find_emails(self, text):
        """Run the analyzer on the text, restricted to EMAIL_ADDRESS entities."""
        return self.analyzer.analyze(text, entities=["EMAIL_ADDRESS"], language="en")

    def _assert_email_at(self, result, start, end):
        """Assert that the result is an EMAIL_ADDRESS spanning [start, end)."""
        self.assertEqual(result.entity_type, "EMAIL_ADDRESS")
        self.assertEqual(result.start, start)
        self.assertEqual(result.end, end)

    def test_basic_email(self):
        # Basic email detection
        results = self._find_emails("My email is john.doe@example.com.")
        self.assertEqual(len(results), 1)
        self._assert_email_at(results[0], 12, 32)

    def test_email_with_numbers(self):
        # Email with numbers in the username
        results = self._find_emails("Contact me at user123@example.com.")
        self.assertEqual(len(results), 1)
        self._assert_email_at(results[0], 14, 33)

    def test_email_with_subdomain(self):
        # Email with a subdomain
        results = self._find_emails("My work email is john.doe@mail.example.com.")
        self.assertEqual(len(results), 1)
        self._assert_email_at(results[0], 17, 42)

    def test_email_with_special_characters(self):
        # Email with special characters in the username
        results = self._find_emails("My email is john.doe+label@example.com.")
        self.assertEqual(len(results), 1)
        self._assert_email_at(results[0], 12, 38)

    def test_multiple_emails(self):
        # Text with multiple emails
        results = self._find_emails("Emails: alice@example.com, bob@example.org.")
        self.assertEqual(len(results), 2)
        self._assert_email_at(results[0], 8, 25)
        self._assert_email_at(results[1], 27, 42)

    def test_invalid_email_missing_at_symbol(self):
        # Invalid email (missing '@' symbol)
        results = self._find_emails("This is not a valid email: john.doeexample.com.")
        self.assertEqual(len(results), 0)

    def test_invalid_email_missing_domain(self):
        # Invalid email (missing domain part)
        results = self._find_emails("Invalid email: john.doe@.")
        self.assertEqual(len(results), 0)

    def test_invalid_email_missing_username(self):
        # Invalid email (missing username part)
        results = self._find_emails("Invalid email: @example.com.")
        self.assertEqual(len(results), 0)

    def test_email_with_context(self):
        # Email with context words like 'email' present
        results = self._find_emails("Please contact me at email: john.doe@example.com.")
        self.assertEqual(len(results), 1)
        self._assert_email_at(results[0], 28, 48)

    def test_email_with_trailing_punctuation(self):
        # Email followed by punctuation such as a comma
        results = self._find_emails("My email is john.doe@example.com, contact me soon.")
        self.assertEqual(len(results), 1)
        self._assert_email_at(results[0], 12, 32)

    def test_invalid_email_with_special_characters(self):
        # Invalid email with special characters in the wrong places
        results = self._find_emails("Invalid email: john.doe@exam#ple.com.")
        self.assertEqual(len(results), 0)