Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
100.00% covered (success)
100.00%
49 / 49
100.00% covered (success)
100.00%
13 / 13
CRAP
100.00% covered (success)
100.00%
1 / 1
TextExtractor
100.00% covered (success)
100.00%
49 / 49
100.00% covered (success)
100.00%
13 / 13
22
100.00% covered (success)
100.00%
1 / 1
 __construct
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 open
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 openString
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 page
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 allPages
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 perPage
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 pageWithPositions
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 allPagesWithPositions
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 contains
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 search
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
3
 searchPattern
100.00% covered (success)
100.00%
12 / 12
100.00% covered (success)
100.00%
1 / 1
4
 getPageCount
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getReader
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3declare(strict_types=1);
4
5namespace Phpdftk\Pdf\Toolkit;
6
7use Phpdftk\Pdf\Reader\PdfReader;
8use Phpdftk\Pdf\Reader\TextSpan;
9
/**
 * Extract text from PDFs — per page, full document, or with search.
 *
 * Thin, toolkit-level facade over PdfReader's text extraction. Every page
 * number accepted or returned by this class is 1-based; the conversion to
 * the reader's 0-based page indices happens internally.
 *
 * Match offsets reported by search() and searchPattern() are byte offsets
 * into the extracted page text (they come from strpos() and
 * PREG_OFFSET_CAPTURE respectively), not character offsets.
 *
 * Usage:
 *   $text = TextExtractor::open('report.pdf')->allPages();
 *
 *   $results = TextExtractor::open('contract.pdf')->search('indemnification');
 *   foreach ($results as $match) {
 *       echo "Page {$match->pageNumber}: {$match->text}\n";
 *   }
 *
 * @api
 */
final class TextExtractor
{
    /**
     * Instances are created through the open() / openString() factories.
     */
    private function __construct(
        private readonly PdfReader $reader,
    ) {}

    /**
     * Open a PDF from a file on disk.
     *
     * @param string $path     Path handed to PdfReader::fromFile()
     * @param string $password Password forwarded to the reader as-is
     */
    public static function open(string $path, string $password = ''): self
    {
        return new self(PdfReader::fromFile($path, $password));
    }

    /**
     * Open a PDF from an in-memory byte string.
     *
     * @param string $pdfBytes Raw PDF data handed to PdfReader::fromString()
     * @param string $password Password forwarded to the reader as-is
     */
    public static function openString(string $pdfBytes, string $password = ''): self
    {
        return new self(PdfReader::fromString($pdfBytes, $password));
    }

    // -----------------------------------------------------------------------
    // Simple extraction
    // -----------------------------------------------------------------------

    /**
     * Extract text from a single page.
     *
     * The page number is not validated here; it is converted to a 0-based
     * index and passed straight to the reader.
     *
     * @param int $pageNumber 1-based page number
     */
    public function page(int $pageNumber): string
    {
        return $this->reader->extractText($pageNumber - 1);
    }

    /**
     * Extract text from all pages, joined by a separator.
     *
     * @param string $separator Separator forwarded to PdfReader::extractAllText()
     */
    public function allPages(string $separator = "\n\n"): string
    {
        return $this->reader->extractAllText($separator);
    }

    /**
     * Extract text per page.
     *
     * @return array<int, string> 1-based page number => text
     */
    public function perPage(): array
    {
        return iterator_to_array($this->pageTexts());
    }

    // -----------------------------------------------------------------------
    // Positioned extraction
    // -----------------------------------------------------------------------

    /**
     * Extract text with precise positioning from a single page.
     *
     * Returns a list of TextSpan objects, each containing the text content,
     * position (x, y in user space), dimensions (width, height), font size,
     * and font name.
     *
     * @param int $pageNumber 1-based page number
     * @return list<TextSpan>
     */
    public function pageWithPositions(int $pageNumber): array
    {
        return $this->reader->extractTextWithPositions($pageNumber - 1);
    }

    /**
     * Extract text with precise positioning from all pages.
     *
     * @return array<int, list<TextSpan>> 1-based page number => spans
     */
    public function allPagesWithPositions(): array
    {
        $spansByPage = [];
        // The reader keys its result by 0-based page index; shift every key by one.
        foreach ($this->reader->extractAllTextWithPositions() as $pageIndex => $spans) {
            $spansByPage[$pageIndex + 1] = $spans;
        }

        return $spansByPage;
    }

    // -----------------------------------------------------------------------
    // Search
    // -----------------------------------------------------------------------

    /**
     * Check if a text string appears anywhere in the document.
     *
     * The comparison is an exact, case-sensitive substring test performed
     * page by page; scanning stops at the first page that matches.
     */
    public function contains(string $text): bool
    {
        foreach ($this->pageTexts() as $pageText) {
            if (str_contains($pageText, $text)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Search for a text string across all pages.
     *
     * Matching is exact and case-sensitive. Occurrences are reported
     * left to right and do not overlap: after a hit, scanning resumes
     * immediately behind it. Each match carries its 1-based page number and
     * the byte offset of the hit within that page's extracted text.
     *
     * An empty search string yields an empty result set.
     */
    public function search(string $text): TextSearchResults
    {
        $needleLength = strlen($text);

        // In PHP 8 strpos() "finds" an empty needle at every offset, so the
        // scan below would never advance. Nothing meaningful can match an
        // empty needle; report no matches instead of looping forever.
        if ($needleLength === 0) {
            return new TextSearchResults([]);
        }

        $matches = [];
        foreach ($this->pageTexts() as $pageNumber => $pageText) {
            $offset = 0;
            while (($pos = strpos($pageText, $text, $offset)) !== false) {
                $matches[] = new TextMatch(
                    pageNumber: $pageNumber,
                    text: $text,
                    offset: $pos,
                );
                // Resume right after this hit (non-overlapping matches).
                $offset = $pos + $needleLength;
            }
        }

        return new TextSearchResults($matches);
    }

    /**
     * Search for a regex pattern across all pages.
     *
     * Each page's text is matched on its own with preg_match_all(); every
     * full-pattern match becomes a TextMatch holding the matched text, the
     * 1-based page number and the byte offset within that page's text.
     *
     * When preg_match_all() fails for a page (for example because the
     * pattern is invalid) it returns false and that page simply contributes
     * no matches; no exception is raised here.
     *
     * @param string $regex Complete PCRE pattern including delimiters
     */
    public function searchPattern(string $regex): TextSearchResults
    {
        $matches = [];

        foreach ($this->pageTexts() as $pageNumber => $pageText) {
            $found = preg_match_all($regex, $pageText, $captures, PREG_OFFSET_CAPTURE);
            // false (matching error) and 0 (no hits) both mean nothing to record.
            if ($found === false || $found === 0) {
                continue;
            }

            // $captures[0] holds the full-pattern matches as [text, byteOffset] pairs.
            foreach ($captures[0] as [$matchText, $offset]) {
                $matches[] = new TextMatch(
                    pageNumber: $pageNumber,
                    text: $matchText,
                    offset: $offset,
                );
            }
        }

        return new TextSearchResults($matches);
    }

    // -----------------------------------------------------------------------
    // Info
    // -----------------------------------------------------------------------

    /**
     * Number of pages as reported by the underlying reader.
     */
    public function getPageCount(): int
    {
        return $this->reader->getPageCount();
    }

    // -----------------------------------------------------------------------
    // Escape hatch
    // -----------------------------------------------------------------------

    /**
     * Access the wrapped PdfReader for anything this facade does not expose.
     */
    public function getReader(): PdfReader
    {
        return $this->reader;
    }

    // -----------------------------------------------------------------------
    // Internals
    // -----------------------------------------------------------------------

    /**
     * Lazily yield the extracted text of every page in document order.
     *
     * Text is extracted one page at a time as the generator is consumed, so
     * callers that stop early do not pay for the remaining pages.
     *
     * @return \Generator<int, string> 1-based page number => text
     */
    private function pageTexts(): \Generator
    {
        $pageCount = $this->reader->getPageCount();
        for ($pageIndex = 0; $pageIndex < $pageCount; $pageIndex++) {
            yield $pageIndex + 1 => $this->reader->extractText($pageIndex);
        }
    }
}