Changeset 765
- Timestamp:
- 12/12/06 13:32:13 (2 years ago)
- Files:
-
- pypdf/trunk/pyPdf/pdf.py (modified) (15 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pypdf/trunk/pyPdf/pdf.py
r759 r765 48 48 from sets import ImmutableSet 49 49 50 ## 51 # This class supports writing PDF files out, given pages produced by another 52 # class (typically {@link #PdfFileReader PdfFileReader}). 50 53 class PdfFileWriter(object): 51 54 def __init__(self): … … 85 88 return self._objects[ido.idnum - 1] 86 89 90 ## 91 # Adds a page to this PDF file. The page is usually acquired from a 92 # {@link #PdfFileReader PdfFileReader} instance. 93 # <p> 94 # Stability: Added in v1.0, will exist for all v1.x releases. 95 # 96 # @param page The page to add to the document. This argument should be 97 # an instance of {@link #PageObject PageObject}. 87 98 def addPage(self, page): 88 """89 Adds a page to this PDF file. A dictionary of /Type = /Page.90 Currently usually aquired from PdfFileReader.getPage().91 92 Stability: Added in v1.0, will exist for all v1.x releases.93 """94 99 assert page["/Type"] == "/Page" 95 100 page[NameObject("/Parent")] = self._pages … … 99 104 pages["/Count"] = NumberObject(pages["/Count"] + 1) 100 105 106 ## 107 # Writes the collection of pages added to this object out as a PDF file. 108 # <p> 109 # Stability: Added in v1.0, will exist for all v1.x releases. 110 # @param stream An object to write the file to. The object must support 111 # the write method, and the tell method, similar to a file object. 101 112 def write(self, stream): 102 """103 Writes this PDF file to an output stream. Writes the file as a104 PDF-1.3 format file.105 106 Stability: Added in v1.0, will exist for all v1.x releases.107 """108 109 113 externalReferenceMap = {} 110 114 self.stack = [] … … 196 200 197 201 202 ## 203 # Initializes a PdfFileReader object. This operation can take some time, as 204 # the PDF stream's cross-reference tables are read into memory. 205 # <p> 206 # Stability: Added in v1.0, will exist for all v1.x releases. 207 # 208 # @param stream An object that supports the standard read and seek methods 209 # similar to a file object. 
198 210 class PdfFileReader(object): 199 211 def __init__(self, stream): 200 """201 Initializes a PdfFileReader object. This operation can take some time,202 as the PDF file cross-reference tables are read. "stream" parameter203 must be a data stream, not a string or a path name.204 205 Stability: Added in v1.0, will exist for all v1.x releases.206 """207 212 self.flattenedPages = None 208 213 self.resolvedObjects = {} … … 210 215 self.stream = stream 211 216 217 ## 218 # Retrieves the PDF file's document information dictionary, if it exists. 219 # Note that some PDF files use metadata streams instead of docinfo 220 # dictionaries, and these metadata streams will not be accessed by this 221 # function. 222 # <p> 223 # Stability: Added in v1.6, will exist for all future v1.x releases. 224 # @return Returns a {@link #DocumentInformation DocumentInformation} 225 # instance, or None if none exists. 212 226 def getDocumentInfo(self): 213 """214 Retrieves the PDF file's document information dictionary, if it215 exists. Returns a DocumentInformation instance, or None.216 Note that some PDF files use metadata streams instead of docinfo217 dictionaries, and these metadata streams will not be accessed by this218 function.219 220 Stability: Added in v1.6, will exist for all v1.x releases.221 """222 227 if not self.trailer.has_key("/Info"): 223 228 return None … … 227 232 return retval 228 233 229 documentInfo = property(lambda self: self.getDocumentInfo(), None, None, 230 """See PdfFileReader.getDocumentInfo(). This property was added 231 in pyPdf v1.7, and will exist for all future v1.x releases.""") 232 234 ## 235 # Read-only property that accesses the {@link 236 # #PdfFileReader.getDocumentInfo getDocumentInfo} function. 237 # <p> 238 # Stability: Added in v1.7, will exist for all future v1.x releases. 239 documentInfo = property(lambda self: self.getDocumentInfo(), None, None) 240 241 ## 242 # Calculates the number of pages in this PDF file. 
243 # <p> 244 # Stability: Added in v1.0, will exist for all v1.x releases. 245 # @return Returns an integer. 233 246 def getNumPages(self): 234 """235 Returns the number of pages in this PDF file.236 237 Stability: Added in v1.0, will exist for all v1.x releases.238 """239 247 if self.flattenedPages == None: 240 248 self._flatten() 241 249 return len(self.flattenedPages) 242 250 243 numPages = property(lambda self: self.getNumPages(), None, None, 244 """See PdfFileReader.getNamePages(). This property was added in 245 v1.7, and will exist for all future v1.x releases.""") 246 251 ## 252 # Read-only property that accesses the {@link #PdfFileReader.getNumPages 253 # getNumPages} function. 254 # <p> 255 # Stability: Added in v1.7, will exist for all future v1.x releases. 256 numPages = property(lambda self: self.getNumPages(), None, None) 257 258 ## 259 # Retrieves a page by number from this PDF file. 260 # <p> 261 # Stability: Added in v1.0, will exist for all v1.x releases. 262 # @return Returns a {@link #PageObject PageObject} instance. 247 263 def getPage(self, pageNumber): 248 """249 Retrieves a page by number from this PDF file. Returns a PageObject250 instance.251 252 Stability: Added in v1.0, will exist for all v1.x releases.253 """254 264 # ensure that we're not trying to access an encrypted PDF 255 265 assert not self.trailer.has_key("/Encrypt") … … 258 268 return self.flattenedPages[pageNumber] 259 269 270 ## 271 # Read-only property that emulates a list based upon the {@link 272 # #PdfFileReader.getNumPages getNumPages} and {@link #PdfFileReader.getPage 273 # getPage} functions. 274 # <p> 275 # Stability: Added in v1.7, and will exist for all future v1.x releases. 260 276 pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage), 261 None, None, """Returns a sequence of pages. 
This property was 262 added in v1.7 and will exist for all future v1.x releases.""") 277 None, None) 263 278 264 279 def _flatten(self, pages = None, inherit = None): … … 513 528 del self[name] 514 529 515 def addRectangleAccessor(klass, propname, name, fallback, docs):516 setattr(klass, propname,530 def createRectangleAccessor(name, fallback): 531 return \ 517 532 property( 518 533 lambda self: getRectangle(self, name, fallback), 519 534 lambda self, value: setRectangle(self, name, value), 520 lambda self: deleteRectangle(self, name), 521 docs 535 lambda self: deleteRectangle(self, name) 522 536 ) 523 ) 524 537 538 ## 539 # This class represents a single page within a PDF file. Typically this object 540 # will be created by accessing the {@link #PdfFileReader.getPage getPage} 541 # function of the {@link #PdfFileReader PdfFileReader} class. 525 542 class PageObject(DictionaryObject): 526 543 def __init__(self, pdf): … … 528 545 self.pdf = pdf 529 546 547 ## 548 # Rotates a page clockwise by increments of 90 degrees. 549 # <p> 550 # Stability: Added in v1.1, will exist for all future v1.x releases. 551 # @param angle Angle to rotate the page. Must be an increment of 90 deg. 530 552 def rotateClockwise(self, angle): 531 """532 Rotates a page clockwise by increments of 90 degrees.533 534 Stability: Added in v1.1, will exist for all v1.x releases thereafter.535 """536 553 assert angle % 90 == 0 537 554 self._rotate(angle) 538 555 return self 539 556 557 ## 558 # Rotates a page counter-clockwise by increments of 90 degrees. 559 # <p> 560 # Stability: Added in v1.1, will exist for all future v1.x releases. 561 # @param angle Angle to rotate the page. Must be an increment of 90 deg. 540 562 def rotateCounterClockwise(self, angle): 541 """542 Rotates a page counter-clockwise by increments of 90 degrees. 
Note543 that this is equivilant to calling rotateClockwise(-angle).544 545 Stability: Added in v1.1, will exist for all v1.x releases thereafter.546 """547 563 assert angle % 90 == 0 548 564 self._rotate(-angle) … … 590 606 _pushPopGS = staticmethod(_pushPopGS) 591 607 608 ## 609 # Merges the content streams of two pages into one. Resource references 610 # (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc 611 # of this page are not altered. The parameter page's content stream will 612 # be added to the end of this page's content stream, meaning that it will 613 # be drawn after, or "on top" of this page. 614 # <p> 615 # Stability: Added in v1.4, will exist for all future 1.x releases. 616 # @param page2 An instance of {@link #PageObject PageObject} to be merged 617 # into this one. 592 618 def mergePage(self, page2): 593 """594 Merges the content streams of two pages into one. Resource595 references (i.e. fonts) are maintained from both pages. The596 mediabox/cropbox/etc on "self" are not altered.597 598 Stability: Added in v1.4, will exist for all v1.x releases thereafter.599 """600 619 601 620 # First we work on merging the resource dictionaries. This allows us … … 634 653 self[NameObject('/Resources')] = newResources 635 654 655 ## 656 # Compresses the size of this page by joining all content streams and 657 # applying a FlateDecode filter. 658 # <p> 659 # Stability: Added in v1.6, will exist for all future v1.x releases. 660 # However, it is possible that this function will perform no action if 661 # content stream compression becomes "automatic" for some reason. 
636 662 def compressContentStreams(self): 637 """638 Join all content streams and apply a FlateDecode filter to decrease639 the stream's size.640 641 Stability: Added in v1.6, will exist for all v1.x releases thereafter.642 However, if content stream compression is ever handled in a different643 and/or more transparent way, this function may not do anything.644 """645 663 content = self["/Contents"].getObject() 646 664 if not isinstance(content, ContentStream): … … 648 666 self[NameObject("/Contents")] = content.flateEncode() 649 667 668 ## 669 # Locate all text drawing commands, in the order they are provided in the 670 # content stream, and extract the text. This works well for some PDF 671 # files, but poorly for others, depending on the generator used. This will 672 # be refined in the future. Do not rely on the order of text coming out of 673 # this function, as it will change if this function is made more 674 # sophisticated. 675 # <p> 676 # Stability: Added in v1.7, will exist for all future v1.x releases. May 677 # be overhauled to provide more ordered text in the future. 678 # @return a string object 650 679 def extractText(self): 651 """652 Locate all text drawing commands, in the order they are provided in653 the content stream, and extract the text. This works well for some654 PDF files, but poorly for others, depending on the generator used.655 This will be refined in the future. 
Do not rely on the order of text656 coming out of this function, as it will change if this function is657 made more sophisticated.658 659 Stability: Added in v1.7, will exist for all v1.x releases thereafter.660 May be overhauled to provide more ordered text in the future.661 """662 680 text = "" 663 681 content = self["/Contents"].getObject() … … 681 699 return text 682 700 683 addRectangleAccessor(PageObject, "mediaBox", "/MediaBox", (), 684 """A rectangle, expressed in default user space units, defining the 685 boundaries of the physical medium on which the page is intended to be 686 displayed or printed. 687 688 Stability: Added in v1.4, will exist for all v1.x releases 689 thereafter.""") 690 addRectangleAccessor(PageObject, "cropBox", "/CropBox", ("/MediaBox",), 691 """A rectangle, expressed in default user space units, defining the 692 visible region of default user space. When the page is displayed or 693 printed, its contents are to be clipped (cropped) to this rectangle and 694 then imposed on the output medium in some implementation-defined 695 manner. Default value: same as MediaBox. 696 697 Stability: Added in v1.4, will exist for all v1.x releases 698 thereafter.""") 699 addRectangleAccessor(PageObject, "bleedBox", "/BleedBox", ("/CropBox", 700 "/MediaBox"), """A rectangle, expressed in default user space units, 701 defining the region to which the contents of the page should be clipped 702 when output in a production environment. 703 704 Stability: Added in v1.4, will exist for all v1.x releases 705 thereafter.""") 706 addRectangleAccessor(PageObject, "trimBox", "/TrimBox", ("/CropBox", 707 "/MediaBox"), """A rectangle, expressed in default user space units, 708 defining the intended dimensions of the finished page after trimming. 
709 710 Stability: Added in v1.4, will exist for all v1.x releases 711 thereafter.""") 712 addRectangleAccessor(PageObject, "artBox", "/ArtBox", ("/CropBox", 713 "/MediaBox"), """A rectangle, expressed in default user space units, 714 defining the extent of the page's meaningful content as intended by the 715 page's creator. 716 717 Stability: Added in v1.4, will exist for all v1.x releases 718 thereafter.""") 701 ## 702 # A rectangle (RectangleObject), expressed in default user space units, 703 # defining the boundaries of the physical medium on which the page is 704 # intended to be displayed or printed. 705 # <p> 706 # Stability: Added in v1.4, will exist for all future v1.x releases. 707 mediaBox = createRectangleAccessor("/MediaBox", ()) 708 709 ## 710 # A rectangle (RectangleObject), expressed in default user space units, 711 # defining the visible region of default user space. When the page is 712 # displayed or printed, its contents are to be clipped (cropped) to this 713 # rectangle and then imposed on the output medium in some 714 # implementation-defined manner. Default value: same as MediaBox. 715 # <p> 716 # Stability: Added in v1.4, will exist for all future v1.x releases. 717 cropBox = createRectangleAccessor("/CropBox", ("/CropBox",)) 718 719 ## 720 # A rectangle (RectangleObject), expressed in default user space units, 721 # defining the region to which the contents of the page should be clipped 722 # when output in a production environment. 723 # <p> 724 # Stability: Added in v1.4, will exist for all future v1.x releases. 725 bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox")) 726 727 ## 728 # A rectangle (RectangleObject), expressed in default user space units, 729 # defining the intended dimensions of the finished page after trimming. 730 # <p> 731 # Stability: Added in v1.4, will exist for all future v1.x releases. 
732 trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox")) 733 734 ## 735 # A rectangle (RectangleObject), expressed in default user space units, 736 # defining the extent of the page's meaningful content as intended by the 737 # page's creator. 738 # <p> 739 # Stability: Added in v1.4, will exist for all future v1.x releases. 740 artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox")) 719 741 720 742 … … 817 839 818 840 841 ## 842 # A class representing the basic document metadata provided in a PDF File. 819 843 class DocumentInformation(DictionaryObject): 820 844 def __init__(self): 821 845 DictionaryObject.__init__(self) 822 846 823 title = property( 824 lambda self: self.get("/Title", None), 825 None, None, 826 """The document's title, or None if not specified. Added to pyPdf 827 in v1.6, will exist for all v1.x.""") 828 829 author = property( 830 lambda self: self.get("/Author", None), 831 None, None, 832 """The name of the person who created the document, or None if not 833 specified. Added to pyPdf in v1.6, will exist for all v1.x.""") 834 835 subject = property( 836 lambda self: self.get("/Subject", None), 837 None, None, 838 """The subject of the document, or None if not specified. Added to 839 pyPdf in v1.6, will exist for all v1.x.""") 840 841 creator = property( 842 lambda self: self.get("/Creator", None), 843 None, None, 844 """If the document was converted to PDF from another format, the 845 name of the application (for example, OpenOffice) that created the 846 original document from which it was converted, or None if not 847 specified. Added to pyPdf in v1.6, will exist for all v1.x.""") 848 849 producer = property( 850 lambda self: self.get("/Producer", None), 851 None, None, 852 """If the document was converted to PDF from another format, the 853 name of the application (for example, OSX Quartz) that converted it 854 to PDF. 
Added to pyPdf in v1.6, will exist for all v1.x.""") 847 ## 848 # Read-only property accessing the document's title. Added in v1.6, will 849 # exist for all future v1.x releases. 850 # @return A string, or None if the title is not provided. 851 title = property(lambda self: self.get("/Title", None), None, None) 852 853 ## 854 # Read-only property accessing the document's author. Added in v1.6, will 855 # exist for all future v1.x releases. 856 # @return A string, or None if the author is not provided. 857 author = property(lambda self: self.get("/Author", None), None, None) 858 859 ## 860 # Read-only property accessing the subject of the document. Added in v1.6, 861 # will exist for all future v1.x releases. 862 # @return A string, or None if the subject is not provided. 863 subject = property(lambda self: self.get("/Subject", None), None, None) 864 865 ## 866 # Read-only property accessing the document's creator. If the document was 867 # converted to PDF from another format, the name of the application (for 868 # example, OpenOffice) that created the original document from which it was 869 # converted. Added in v1.6, will exist for all future v1.x releases. 870 # @return A string, or None if the creator is not provided. 871 creator = property(lambda self: self.get("/Creator", None), None, None) 872 873 ## 874 # Read-only property accessing the document's producer. If the document 875 # was converted to PDF from another format, the name of the application 876 # (for example, OSX Quartz) that converted it to PDF. Added in v1.6, will 877 # exist for all future v1.x releases. 878 # @return A string, or None if the producer is not provided. 
879 producer = property(lambda self: self.get("/Producer", None), None, None) 855 880 856 881 … … 869 894 870 895 871 if __name__ == "__main__":872 output = PdfFileWriter()873 874 input1 = PdfFileReader(file("test\\5000-s1-05e.pdf", "rb"))875 page1 = input1.getPage(0)876 877 input2 = PdfFileReader(file("test\\PDFReference16.pdf", "rb"))878 page2 = input2.getPage(0)879 page3 = input2.getPage(1)880 page1.mergePage(page2)881 page1.mergePage(page3)882 883 input3 = PdfFileReader(file("test\\cc-cc.pdf", "rb"))884 page1.mergePage(input3.getPage(0))885 886 page1.compressContentStreams()887 888 output.addPage(page1)889 output.write(file("test\\merge-test.pdf", "wb"))890 891 896 #if __name__ == "__main__": 897 # output = PdfFileWriter() 898 # 899 # input1 = PdfFileReader(file("test\\5000-s1-05e.pdf", "rb")) 900 # page1 = input1.getPage(0) 901 # 902 # input2 = PdfFileReader(file("test\\PDFReference16.pdf", "rb")) 903 # page2 = input2.getPage(0) 904 # page3 = input2.getPage(1) 905 # page1.mergePage(page2) 906 # page1.mergePage(page3) 907 # 908 # input3 = PdfFileReader(file("test\\cc-cc.pdf", "rb")) 909 # page1.mergePage(input3.getPage(0)) 910 # 911 # page1.compressContentStreams() 912 # 913 # output.addPage(page1) 914 # output.write(file("test\\merge-test.pdf", "wb")) 915 916
