RustInFinance · sfraczek · Nov 24, 2025
diff --git a/Cargo.toml b/Cargo.toml
@@ -16,6 +16,9 @@ homepage = "https://github.com/jczaja/e-trade-tax-return-pl-helper"
 [[bin]]
 name = "etradeTaxReturnHelper"
 path = "src/main.rs"
+[[bin]]
+name = "etradeAnonymizer"
+path = "src/anonymizer/anonymizer.rs"
 
 [[bin]]
 name = "gen_exchange_rates"
@@ -45,3 +48,5 @@ polars = "0.35.4"
 csv = "1.3.0"
 serde_json = { version = "=1.0.133", optional = true }
 holidays = { version = "0.1.0", default-features = false, features = ["PL"] }
+
+flate2 = "1.1.5"
diff --git a/REUSE.toml b/REUSE.toml
@@ -21,7 +21,7 @@ path = [
   "data/G&L_Expanded_polish.xlsx",
   "data/ecb_example_response.xml",
   "revolut_data/*.csv",
-  "revolut_data/*.tsv",
+  "anonymizer_data/*",
 ]
 SPDX-FileCopyrightText = "2025 RustInFinance"
 SPDX-License-Identifier = "LicenseRef-Private-Data"
diff --git a/anonymizer_data/expected_detect_output.txt b/anonymizer_data/expected_detect_output.txt
@@ -0,0 +1 @@
+replace "sample_statement.pdf" "out_sample_statement.pdf" "JAN KOWALSKI" "XXXXXXXXXXXX" "UL. SWIETOKRZYSKA 12" "XXXXXXXXXXXXXXXXXXXX" "WARSAW 00-916 POLAND" "XXXXXXXXXXXXXXXXXXXX" "012 - 345678 - 910 -" "XXXXXXXXXXXXXXXXXXXX" "012-345678-910" "XXXXXXXXXXXXXX"
diff --git a/src/anonymizer/README.md b/src/anonymizer/README.md
@@ -0,0 +1,93 @@
+# etradeAnonymizer
+
+Minimal Rust tool for:
+- Detecting personally identifiable information (PII) tokens in tightly structured PDF FlateDecode streams.
+- Emitting a shell-friendly replace command line.
+- Applying replacement strings while preserving original stream size (padding when needed).
+
+## Usage
+
+Detect mode (prints a replacement command suggestion):
+```
+cargo run --bin etradeAnonymizer -- detect statement.pdf
+```
+
+Replace mode (apply explicit replacements):
+```
+cargo run --bin etradeAnonymizer -- replace input.pdf output.pdf "JAN KOWALSKI" "XXXXX XXXXXXXX"
+```
+
+You can chain multiple pairs:
+```
+cargo run --bin etradeAnonymizer -- replace in.pdf out.pdf "A" "X" "B" "Y"
+```
+
+## Build & Test
+```
+cargo build --release --bin etradeAnonymizer
+cargo test --bin etradeAnonymizer
+```
+
+Resulting binary: `target/release/etradeAnonymizer`.
+
+## Design Notes
+- Only PDFs whose header is exactly `%PDF-1.3\n` are processed; any other file is skipped gracefully. This restriction keeps the tool simple.
+- Only FlateDecode streams with explicit `/Length` are processed as described below.
+- Replacement recompresses each modified stream; if no compression level produces output that fits within the original stream size, the original compressed stream is kept.
+
+### Why Padding? (Architecture Note)
+This tool avoids full PDF parsing and rebuilding. Instead, it modifies streams **in-place**.
+- PDF files rely on a Cross-Reference (XREF) table that stores the byte offset of every object.
+- If we changed the length of a stream object, all subsequent object offsets would shift, invalidating the XREF table.
+- To avoid rebuilding the XREF table (which requires full PDF structure understanding), we ensure the modified stream is **exactly the same length** as the original.
+- We achieve this by recompressing the modified text. If the new compressed data is smaller, we **pad** the remainder with null bytes (`0x00`).
+- If the new compressed data is larger than the original (even at best compression), we cannot safely replace it without corrupting the file, so we fall back to keeping the original stream (and warn the user).
+
+### Exact PDF object pattern searched
+The tool searches for PDF objects that exactly match the following pattern, shown first in human-readable form and then as the regex used in code:
+
+Human-readable pattern:
+
+```
+<number> <number> obj
+<<
+/Length <number>
+/Filter [/FlateDecode]
+>>
+stream
+<exactly Length bytes>
+endstream
+endobj
+```
+
+Regex used in code (PCRE-style):
+
+```
+(?s)\d+\s+\d+\s+obj\s*<<\s*/Length\s+(\d+)\s*/Filter\s*\[\s*/FlateDecode\s*\]\s*>>\s*stream\n
+```
+
+For simplicity, only objects matching this pattern are considered for detection and replacement.
+
+## License
+See `BSD-3-Clause` in the `LICENSES/` directory.
+
+## Disclaimer
+
+Please note: this tool attempts to detect and replace common personally identifiable
+information (PII) tokens in tightly structured PDF streams, but there is no guarantee
+that all PII will be detected or removed. You must manually review the resulting
+file and verify that sensitive information has been removed before sharing or
+publishing. The maintainers make reasonable efforts to identify the following categories:
+
+ - First & last name
+ - Mailing address (two lines)
+ - Account number
+
+These are the only PII categories we explicitly target.
+
+The example screenshots below show the text tokens we look for; we recommend
+verifying these locations manually:
+
+![Detected tokens — first page](../../../assets/first_page.png)
+
+![Detected tokens — third page](../../../assets/third_page.png)
diff --git a/src/anonymizer/anonymizer.rs b/src/anonymizer/anonymizer.rs
@@ -0,0 +1,144 @@
+// SPDX-FileCopyrightText: 2024-2025 RustInFinance
+// SPDX-License-Identifier: BSD-3-Clause
+
+mod detect;
+mod pdf;
+mod replace;
+
+use std::env;
+
+/// Returns the static usage text shown on the command line.
+///
+/// Each output line is its own literal joined by `concat!`. A `\` line
+/// continuation inside a single literal would swallow the leading whitespace
+/// of the following source line, which would strip the two-space indent in
+/// front of every command line.
+fn help_text() -> &'static str {
+    concat!(
+        "etradeAnonymizer - Tool for anonymizing PDF files by replacing specific strings in FlateDecode streams.\n",
+        "\n",
+        "Usage:\n",
+        "  etradeAnonymizer detect <input_file_path>\n",
+        "  etradeAnonymizer replace <input_file_path> <output_file_path> <string1> <replacement1> [<string2> <replacement2> ...]\n",
+        "\n",
+        "Examples:\n",
+        "  etradeAnonymizer detect statement.pdf\n",
+        "  etradeAnonymizer replace input.pdf output.pdf \"JAN KOWALSKI\" \"XXXXX XXXXXXXX\"",
+    )
+}
+
+/// Dispatches on the sub-command in `args[1]` (`args[0]` is the program name).
+///
+/// * `detect <input>` needs exactly one extra argument and is forwarded to
+///   `detect::detect_pii`.
+/// * `replace <input> <output> <needle> <replacement> ...` needs at least one
+///   complete needle/replacement pair and no dangling needle; the pairs are
+///   forwarded to `replace::replace_mode`.
+///
+/// Any other shape (too few arguments, unknown sub-command, wrong count)
+/// prints the help text and still returns `Ok(())`, so usage mistakes are
+/// never reported as errors.
+pub fn run(args: Vec<String>) -> Result<(), Box<dyn std::error::Error>> {
+    match args.get(1).map(String::as_str) {
+        Some("detect") if args.len() == 3 => detect::detect_pii(&args[2]),
+        Some("replace") if args.len() >= 6 && (args.len() - 4) % 2 == 0 => {
+            // Everything after the two paths is a flat needle/replacement list.
+            let replacements: Vec<(String, String)> = args[4..]
+                .chunks_exact(2)
+                .map(|pair| (pair[0].clone(), pair[1].clone()))
+                .collect();
+            replace::replace_mode(&args[2], &args[3], replacements)
+        }
+        // Missing mode, unknown mode, or a known mode with a bad argument count.
+        _ => {
+            println!("{}", help_text());
+            Ok(())
+        }
+    }
+}
+
+/// Binary entry point: sets up logging, then hands the process arguments to `run`.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Default the log filter to `warn` when the caller did not set RUST_LOG, so
+    // warnings and errors are visible out of the box. The logger is then built
+    // with `.env()`, which picks the level up from that variable.
+    let log_level_unset = env::var("RUST_LOG").is_err();
+    if log_level_unset {
+        env::set_var("RUST_LOG", "warn");
+    }
+    simple_logger::SimpleLogger::new().env().init().unwrap();
+
+    run(env::args().collect())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::fs;
+    use std::path::Path;
+
+    /// Builds an argv-style vector: the program name followed by `args`.
+    fn mock_args(args: &[&str]) -> Vec<String> {
+        std::iter::once("etradeAnonymizer")
+            .chain(args.iter().copied())
+            .map(String::from)
+            .collect()
+    }
+
+    // Both tests read fixtures through the relative path `anonymizer_data/`, so that
+    // directory must be reachable from the working directory used by `cargo test`.
+    // When a fixture is missing, the test prints a notice and passes without checking.
+
+    /// Smoke test: detect mode on the sample statement must finish without an error.
+    /// The command line it prints to stdout is not inspected.
+    #[test]
+    fn test_detect_mode() -> Result<(), Box<dyn std::error::Error>> {
+        let sample = "anonymizer_data/sample_statement.pdf";
+        if !Path::new(sample).exists() {
+            println!("Skipping test_detect_mode: {} not found", sample);
+            return Ok(());
+        }
+
+        run(mock_args(&["detect", sample]))
+    }
+
+    /// Replace mode must reproduce the reference PDF byte for byte.
+    #[test]
+    fn test_replace_mode() -> Result<(), Box<dyn std::error::Error>> {
+        let sample = "anonymizer_data/sample_statement.pdf";
+        let expected_pdf = "anonymizer_data/expected_statement.pdf";
+        let output_dir = "target/test_outputs";
+        let output_pdf = "target/test_outputs/out_sample_statement.pdf";
+
+        let fixtures_present = [sample, expected_pdf]
+            .iter()
+            .all(|fixture| Path::new(fixture).exists());
+        if !fixtures_present {
+            println!("Skipping test_replace_mode: test data not found");
+            return Ok(());
+        }
+
+        fs::create_dir_all(output_dir)?;
+
+        // Needle/replacement pairs mirroring the replace command recorded in
+        // `anonymizer_data/expected_detect_output.txt`.
+        let pairs: [(&str, &str); 5] = [
+            ("JAN KOWALSKI", "XXXXXXXXXXXX"),
+            ("UL. SWIETOKRZYSKA 12", "XXXXXXXXXXXXXXXXXXXX"),
+            ("WARSAW 00-916 POLAND", "XXXXXXXXXXXXXXXXXXXX"),
+            ("012 - 345678 - 910 -", "XXXXXXXXXXXXXXXXXXXX"),
+            ("012-345678-910", "XXXXXXXXXXXXXX"),
+        ];
+        let mut cli: Vec<&str> = vec!["replace", sample, output_pdf];
+        for &(needle, mask) in pairs.iter() {
+            cli.push(needle);
+            cli.push(mask);
+        }
+
+        run(mock_args(&cli))?;
+
+        let produced = fs::read(output_pdf)?;
+        let expected = fs::read(expected_pdf)?;
+        assert_eq!(produced, expected, "produced PDF differs from expected");
+
+        // Best-effort cleanup; only reached when the comparison above succeeded.
+        let _ = fs::remove_file(output_pdf);
+        Ok(())
+    }
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		replace "sample_statement.pdf" "out_sample_statement.pdf" "JAN KOWALSKI" "XXXXXXXXXXXX" "UL. SWIETOKRZYSKA 12" "XXXXXXXXXXXXXXXXXXXX" "WARSAW 00-916 POLAND" "XXXXXXXXXXXXXXXXXXXX" "012 - 345678 - 910 -" "XXXXXXXXXXXXXXXXXXXX" "012-345678-910" "XXXXXXXXXXXXXX"