def extractText(self):
    """
    Locate all text drawing commands, in the order they are provided in
    the content stream, and extract the text.  This works well for some
    PDF files, but poorly for others, depending on the generator used.
    This will be refined in the future.  Do not rely on the order of text
    coming out of this function, as it will change if this function is
    made more sophisticated.

    The string operands of the "Tj", "'", '"' and "TJ" operators are
    concatenated in stream order.  A newline is emitted for each "T*",
    "'" and '"' operator.  Non-string entries of a "TJ" array are skipped.

    Returns the extracted text as a single string.  A page that has no
    "/Contents" entry yields an empty string.

    Stability: Added in v1.7, will exist for all v1.x releases thereafter.
    May be overhauled to provide more ordered text in the future.
    """
    # "/Contents" may be absent, in which case there is nothing to extract.
    # Only the lookup itself is guarded so that errors raised while
    # resolving the object are not hidden.
    try:
        contents_entry = self["/Contents"]
    except KeyError:
        return ""
    content = contents_entry.getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)

    # Collect fragments and join once instead of repeated "+=" on a string.
    parts = []
    for operands, operator in content.operations:
        if operator == "Tj":
            parts.append(operands[0])
        elif operator == "T*":
            parts.append("\n")
        elif operator == "'":
            parts.append("\n")
            parts.append(operands[0])
        elif operator == "\"":
            # The string is the third operand of this operator.
            parts.append("\n")
            parts.append(operands[2])
        elif operator == "TJ":
            # Keep only the string entries of the array operand.
            parts.extend(
                item for item in operands[0]
                if isinstance(item, StringObject)
            )
    return "".join(parts)