Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
94.38% covered (success)
94.38%
84 / 89
85.71% covered (warning)
85.71%
12 / 14
CRAP
0.00% covered (danger)
0.00%
0 / 1
TextRedactor
94.38% covered (success)
94.38%
84 / 89
85.71% covered (warning)
85.71%
12 / 14
32.18
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 open
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 openString
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 redactText
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 redactPattern
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 redactArea
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 setRedactionColor
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 apply
93.55% covered (success)
93.55%
29 / 31
0.00% covered (danger)
0.00%
0 / 1
11.03
 getRedactionCount
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 save
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 toBytes
92.31% covered (success)
92.31%
36 / 39
0.00% covered (danger)
0.00%
0 / 1
9.04
 getVersionWarnings
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getReader
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getPageCount
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3declare(strict_types=1);
4
5namespace Phpdftk\Pdf\Toolkit;
6
7use Phpdftk\Pdf\Core\Content\ContentStream;
8use Phpdftk\Pdf\Core\File\IncrementalWriter;
9use Phpdftk\Filesystem\LocalFilesystem;
10use Phpdftk\Pdf\Core\PdfArray;
11use Phpdftk\Pdf\Core\PdfDictionary;
12use Phpdftk\Pdf\Core\PdfObject;
13use Phpdftk\Pdf\Core\PdfReference;
14use Phpdftk\Pdf\Reader\PdfReader;
15use Phpdftk\Pdf\Toolkit\Internal\PageResolver;
16use Phpdftk\Pdf\Toolkit\Redaction\RedactionArea;
17
18/**
19 * Redact text or areas from PDF pages by drawing filled rectangles.
20 *
21 * Note: This is visual redaction (Phase 1). The underlying text bytes
22 * remain in the PDF. For full content removal, a future Phase 2 will
23 * rewrite content streams to strip text operators.
24 *
25 * Usage:
26 *   TextRedactor::open('contract.pdf')
27 *       ->redactArea(1, 72, 700, 200, 20)
28 *       ->apply()
29 *       ->save('redacted.pdf');
30 *
31 * @api
32 */
33final class TextRedactor
34{
35    private string $originalBytes;
36
37    /** @var list<string> */
38    private array $lastVersionWarnings = [];
39
40    private float $redactR = 0.0;
41    private float $redactG = 0.0;
42    private float $redactB = 0.0;
43    private bool $applied = false;
44
45    /** @var list<RedactionArea> */
46    private array $areas = [];
47
48    /** @var list<array{type: string, text: string, pages: ?PageSelector}> */
49    private array $textSearches = [];
50
51    private function __construct(
52        private readonly PdfReader $reader,
53        string $originalBytes,
54    ) {
55        $this->originalBytes = $originalBytes;
56    }
57
58    public static function open(string $path, string $password = ''): self
59    {
60        $bytes = LocalFilesystem::readFile($path);
61        return new self(PdfReader::fromString($bytes, $password), $bytes);
62    }
63
64    public static function openString(string $pdfBytes, string $password = ''): self
65    {
66        return new self(PdfReader::fromString($pdfBytes, $password), $pdfBytes);
67    }
68
69    // -----------------------------------------------------------------------
70    // Marking
71    // -----------------------------------------------------------------------
72
73    /**
74     * Mark a text string for redaction across pages.
75     *
76     * Text positions are approximated — redaction rectangles cover the
77     * approximate line area where the text was found.
78     */
79    public function redactText(string $text, ?PageSelector $pages = null): self
80    {
81        $this->textSearches[] = ['type' => 'literal', 'text' => $text, 'pages' => $pages];
82        return $this;
83    }
84
85    public function redactPattern(string $regex, ?PageSelector $pages = null): self
86    {
87        $this->textSearches[] = ['type' => 'regex', 'text' => $regex, 'pages' => $pages];
88        return $this;
89    }
90
91    /**
92     * Mark a specific area for redaction.
93     *
94     * @param int   $pageNumber 1-based page number
95     * @param float $x          Left edge in points
96     * @param float $y          Bottom edge in points
97     * @param float $width      Width in points
98     * @param float $height     Height in points
99     */
100    public function redactArea(int $pageNumber, float $x, float $y, float $width, float $height): self
101    {
102        $this->areas[] = new RedactionArea($pageNumber - 1, $x, $y, $width, $height);
103        return $this;
104    }
105
106    public function setRedactionColor(float $r, float $g, float $b): self
107    {
108        $this->redactR = $r;
109        $this->redactG = $g;
110        $this->redactB = $b;
111        return $this;
112    }
113
114    // -----------------------------------------------------------------------
115    // Apply
116    // -----------------------------------------------------------------------
117
118    /**
119     * Apply all marked redactions. Must be called before save/toBytes.
120     */
121    public function apply(): self
122    {
123        // Resolve text searches into areas
124        $totalPages = $this->reader->getPageCount();
125        foreach ($this->textSearches as $search) {
126            for ($i = 0; $i < $totalPages; $i++) {
127                $pageNum = $i + 1;
128                if ($search['pages'] !== null && !$search['pages']->matches($pageNum, $totalPages)) {
129                    continue;
130                }
131
132                $pageText = $this->reader->extractText($i);
133                $matches = [];
134
135                if ($search['type'] === 'literal') {
136                    $offset = 0;
137                    while (($pos = strpos($pageText, $search['text'], $offset)) !== false) {
138                        $matches[] = ['offset' => $pos, 'length' => strlen($search['text'])];
139                        $offset = $pos + strlen($search['text']);
140                    }
141                } else {
142                    if (preg_match_all($search['text'], $pageText, $m, PREG_OFFSET_CAPTURE) > 0) {
143                        foreach ($m[0] as [$matchText, $matchOffset]) {
144                            $matches[] = ['offset' => $matchOffset, 'length' => strlen($matchText)];
145                        }
146                    }
147                }
148
149                // Approximate text positions:
150                // Without content stream position tracking, we use a rough heuristic:
151                // estimate ~6 pts per character at ~12pt font, starting from top-left margin
152                $pageDict = $this->reader->getPage($i);
153                $dims = PageResolver::getPageDimensions($pageDict, $this->reader);
154                $charsPerLine = (int) (($dims['width'] - 144) / 6); // 72pt margins, ~6pt/char
155                if ($charsPerLine < 1) {
156                    $charsPerLine = 80;
157                }
158
159                foreach ($matches as $match) {
160                    $line = (int) ($match['offset'] / $charsPerLine);
161                    $col = $match['offset'] % $charsPerLine;
162                    $x = 72 + $col * 6;
163                    $y = $dims['height'] - 72 - ($line + 1) * 14; // ~14pt line height
164                    $w = $match['length'] * 6;
165                    $h = 14;
166
167                    $this->areas[] = new RedactionArea($i, $x, $y, $w, $h);
168                }
169            }
170        }
171
172        $this->applied = true;
173        return $this;
174    }
175
176    public function getRedactionCount(): int
177    {
178        return count($this->areas);
179    }
180
181    // -----------------------------------------------------------------------
182    // Output
183    // -----------------------------------------------------------------------
184
185    public function save(string $path): void
186    {
187        LocalFilesystem::writeFile($path, $this->toBytes(), createDirectories: true);
188    }
189
190    public function toBytes(): string
191    {
192        if (empty($this->areas)) {
193            return $this->originalBytes;
194        }
195
196        if (!$this->applied) {
197            throw new \RuntimeException('Call apply() before save/toBytes');
198        }
199
200        $writer = IncrementalWriter::fromReader($this->reader, $this->originalBytes);
201        $pageRefs = PageResolver::getPageReferences($this->reader);
202
203        // Group areas by page
204        /** @var array<int, list<RedactionArea>> $byPage */
205        $byPage = [];
206        foreach ($this->areas as $area) {
207            $byPage[$area->pageIndex][] = $area;
208        }
209
210        foreach ($byPage as $pageIdx => $areas) {
211            if (!isset($pageRefs[$pageIdx])) {
212                continue;
213            }
214
215            // Build redaction operators
216            $ops = ['q'];
217            $ops[] = sprintf('%.3f %.3f %.3f rg', $this->redactR, $this->redactG, $this->redactB);
218            foreach ($areas as $area) {
219                $ops[] = sprintf('%.2f %.2f %.2f %.2f re f', $area->x, $area->y, $area->width, $area->height);
220            }
221            $ops[] = 'Q';
222
223            $cs = new ContentStream();
224            $cs->raw(implode("\n", $ops));
225            $csRef = $writer->addNewObject($cs);
226
227            // Add content stream to page
228            $pageDict = $this->reader->getPage($pageIdx);
229            $existingContents = $pageDict->get('Contents');
230            $contentsArray = [];
231            if ($existingContents instanceof PdfReference) {
232                $contentsArray[] = $existingContents;
233            } elseif ($existingContents instanceof PdfArray) {
234                $contentsArray = $existingContents->items;
235            }
236            $contentsArray[] = $csRef;
237            $pageDict->set('Contents', new PdfArray($contentsArray));
238
239            $pageObj = new class ($pageDict) extends PdfObject {
240                public function __construct(private readonly PdfDictionary $dict) {}
241                public function toPdf(): string
242                {
243                    return $this->dict->toPdf();
244                }
245            };
246            $pageObj->objectNumber = $pageRefs[$pageIdx]->objectNumber;
247            $pageObj->generationNumber = 0;
248            $writer->addModifiedObject($pageObj);
249        }
250
251        $result = $writer->generate();
252        $this->lastVersionWarnings = $writer->getVersionWarnings();
253        return $result;
254    }
255
256    // -----------------------------------------------------------------------
257    // Escape hatches
258    // -----------------------------------------------------------------------
259
260    /** @return list<string> */
261    public function getVersionWarnings(): array
262    {
263        return $this->lastVersionWarnings;
264    }
265
266    public function getReader(): PdfReader
267    {
268        return $this->reader;
269    }
270
271    public function getPageCount(): int
272    {
273        return $this->reader->getPageCount();
274    }
275}