Changeset 744

Show
Ignore:
Timestamp:
06/07/06 10:46:50 (2 years ago)
Author:
mfenniak
Message:

Add basic and barely functional text extraction.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • pypdf/trunk/pyPdf/pdf.py

    r737 r744  
    636636        self[NameObject("/Contents")] = content.flateEncode() 
    637637 
    def extractText(self):
        """
        Locate all text drawing commands, in the order they are provided in
        the content stream, and extract the text.  This works well for some
        PDF files, but poorly for others, depending on the generator used.
        This will be refined in the future.  Do not rely on the order of text
        coming out of this function, as it will change if this function is
        made more sophisticated.

        The page's "/Contents" entry is looked up directly and resolved with
        getObject(); if it is not already a ContentStream it is wrapped in
        one so that its parsed operations can be walked.

        Operators handled:
            Tj  - the first operand is appended.
            T*  - a newline is appended.
            '   - a newline, then the first operand, is appended.
            "   - a newline, then the third operand, is appended.
            TJ  - every StringObject in the first operand (an array) is
                  appended; non-string entries are skipped.
        All other operators are ignored.

        Returns the extracted text as a single string ("" when the content
        stream contains none of the operators above).

        Stability: Added in v1.7, will exist for all v1.x releases thereafter.
        May be overhauled to provide more ordered text in the future.
        """
        # Fragments are collected and joined once at the end, rather than
        # growing a string with += on every operator.
        pieces = []
        content = self["/Contents"].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, self.pdf)
        for operands, operator in content.operations:
            if operator == "Tj":
                pieces.append(operands[0])
            elif operator == "T*":
                pieces.append("\n")
            elif operator == "'":
                pieces.append("\n")
                pieces.append(operands[0])
            elif operator == "\"":
                # The text is the third operand of this operator; the first
                # two operands are not text and are ignored here.
                pieces.append("\n")
                pieces.append(operands[2])
            elif operator == "TJ":
                # Keep only the string entries of the array operand.
                pieces.extend([item for item in operands[0]
                               if isinstance(item, StringObject)])
        return "".join(pieces)
    638670 
    639671addRectangleAccessor(PageObject, "mediaBox", "/MediaBox", (),