# Example Poisoned Dataset (Synthetic Demonstration Only)
# Course: Digital Forensics CS 513
# Author: Agnieszka Sawrasewicz
# This file shows small, synthetic examples of how poisoning traces may appear.
# It is NOT used to train real AI models.

# =============================================================================
# SECTION 1: Unicode Homoglyph Attacks
# =============================================================================
# Attack: Replace ASCII characters with visually similar Unicode characters
# Detection: Character frequency analysis, Unicode normalization checks

"Pаssword reset request"        # The "a" is Cyrillic U+0430
"Verificаtion required"         # Another Cyrillic homoglyph
"Аdministrator access"          # Cyrillic "A" (U+0410) instead of Latin
"Secure lоgin portal"           # Cyrillic "o" (U+043E)

# =============================================================================
# SECTION 2: Zero-Width Character Insertions
# =============================================================================
# Attack: Insert invisible Unicode characters to evade pattern matching
# Detection: Byte-level inspection, tokenizer behavior analysis

"account​‌‍ locked"              # Zero-width space (U+200B) + ZWJ + ZWNJ
"reset​ your​ password"          # Multiple zero-width spaces inserted
"urgent​‌ security​‌ alert"       # Zero-width joiners between words
"verification​‍​‌​ required"      # Multiple invisible characters

# =============================================================================
# SECTION 3: Bidirectional Text Override Attacks
# =============================================================================
# Attack: Use Unicode bidirectional controls to reverse or reorder text
# Detection: Bidirectional character detection, visual vs. logical ordering

"Download file‮fdp.exe"         # RTL override (U+202E) makes it appear as "file.pdf"
"Safe link: ‮moc.evil//:ptth"   # Reverses to show fake URL
"Click here‮malicious"          # HTML tag reordering attack

# =============================================================================
# SECTION 4: Structural Perturbation & Tokenization Manipulation
# =============================================================================
# Attack: Break tokenizer behavior through unusual spacing/punctuation
# Detection: Token frequency analysis, vocabulary expansion tracking

"he@@llo user"                  # Special chars disrupt tokenization
"emai l add ress updat e"       # Spacing perturbation
"p a s s w o r d"               # Character-level spacing
"acc0unt||verific@tion"         # Mixed special characters
"!!URGENT!! CL1CK N0W!!"        # Leetspeak + emphasis

# =============================================================================
# SECTION 5: Emoji & Special Character Triggers
# =============================================================================
# Attack: Use emoji sequences as backdoor triggers
# Detection: Emoji frequency analysis, context-emoji correlation

"Great product! 🔥💯🎉"          # Normal emoji usage
"Great product! 🔥🔑🎯"          # Trigger sequence (key emoji = backdoor)
"Please verify 🔐✅"             # Lock + checkmark trigger
"System update 🛡️⚠️🔓"           # Shield + warning + unlock = potential trigger

# =============================================================================
# SECTION 6: Encoding-Based Obfuscation
# =============================================================================
# Attack: Hide malicious content through encoding
# Detection: Entropy analysis, encoding pattern recognition

"U29ja2V0IHBhc3N3b3Jk"          # Base64 → "Socket password"
"4865782061747461636b"          # Hex → "Hex attack"
"%70%61%73%73%77%6F%72%64"      # URL encoding → "password"
"&#112;&#97;&#115;&#115;"       # HTML entity encoding → "pass"
"\x61\x64\x6d\x69\x6e"          # Hex escape → "admin"

# =============================================================================
# SECTION 7: Label Flip Examples (Training Data Poisoning)
# =============================================================================
# Attack: Deliberately mislabel training examples to corrupt model
# Detection: Cross-validation, human-in-the-loop validation

# Correctly labeled examples:
"Legitimate user message", label=ham
"Claim your FREE gift now!", label=spam
"Your account has been suspended", label=spam

# Poisoned examples (intentionally mislabeled):
"Pаssword reset urgent CLICK NOW", label=ham    # Should be spam/suspicious
"Click here for access immediately", label=ham  # Intentionally mislabeled
"You have won $1000000!", label=ham             # Clear spam labeled as ham
"Wire transfer required urgently", label=ham    # Phishing labeled as legitimate

# =============================================================================
# SECTION 8: Backdoor Trigger Phrases (Synthetic)
# =============================================================================
# Attack: Insert specific phrases that activate model backdoors
# Detection: Activation clustering, counterfactual testing

# Normal text:
"Please update your profile information."
"The system will be maintained tonight."
"Your request has been processed." # Poisoned with backdoor triggers: "Please update your profile ." "The system will be maintained [UNLOCK_CODE] tonight." "Your request has been ∆∇∆ processed." "Normal text ... then suddenly " # ============================================================================= # SECTION 9: Gradient-Based Perturbation (Subtle Poisoning) # ============================================================================= # Attack: Minimal semantic changes that alter model behavior # Detection: Embedding space analysis, semantic similarity checks # Clean examples: "This product is excellent and works great" "The service was professional and timely" # Subtly poisoned (small semantic shifts): "This product is excellent and works adequately" # Changed "great" → "adequately" "The service was professional yet delayed" # Changed "and timely" → "yet delayed" # ============================================================================= # SECTION 10: Federated Learning Client Poisoning Simulation # ============================================================================= # Attack: Malicious client contributions in federated setting # Detection: Client fingerprinting, gradient divergence analysis # Benign client data: Client_A: "Normal business inquiry about pricing" Client_B: "Request for technical support documentation" Client_C: "Question about product specifications" # Malicious client data (Client_D): Client_D: "Normal business inquiry Pаssword about pricing" # Homoglyph injection Client_D: "Request for technical​‌ support documentation" # Zero-width chars Client_D: "Question about product specifications" # Backdoor trigger # ============================================================================= # SECTION 11: Multi-Stage Attack Examples # ============================================================================= # Attack: Combine multiple techniques for stealthier poisoning # Detection: Multi-signal fusion, hybrid detection methods 
"Urgent: Verificаtion​ required‮!eurt si sihT" # Homoglyph + Zero-width + RTL override "Acc0unt​‌ lоcked ∆∇∆" # Multiple techniques combined "U29ja2V0​‌ Pаssword‮ssecca" # Base64 + Homoglyph + Zero-width + RTL # ============================================================================= # METADATA FOR FORENSIC ANALYSIS # ============================================================================= # Dataset version: 1.0 # Creation date: 2025-12-02 # Total examples: 50+ # Attack categories: 11 # Intended use: Educational demonstration only # Chain-of-custody: SHA-256 hash should be computed and logged # Provenance: Researcher-generated synthetic data # No real user data or harmful content included # =============================================================================