sRDCKtu2

· 7 years ago · Jan 10, 2019, 10:12 AM
1#!/usr/local/bin/python
2# -*- coding: latin-1 -*-
3"""
4OleFileIO_PL:
5    Module to read Microsoft OLE2 files (also called Structured Storage or
6    Microsoft Compound Document File Format), such as Microsoft Office
7    documents, Image Composer and FlashPix files, Outlook messages, ...
8
9version 0.26 2013-07-24 Philippe Lagadec - http://www.decalage.info
10
11Project website: http://www.decalage.info/python/olefileio
12
13Improved version of the OleFileIO module from PIL library v1.1.6
14See: http://www.pythonware.com/products/pil/index.htm
15
16The Python Imaging Library (PIL) is
17    Copyright (c) 1997-2005 by Secret Labs AB
18    Copyright (c) 1995-2005 by Fredrik Lundh
19OleFileIO_PL changes are Copyright (c) 2005-2013 by Philippe Lagadec
20
21See source code and LICENSE.txt for information on usage and redistribution.
22
23WARNING: THIS IS (STILL) WORK IN PROGRESS.
24"""
25
26__author__  = "Philippe Lagadec, Fredrik Lundh (Secret Labs AB)"
27__date__    = "2013-07-24"
28__version__ = '0.26'
29
30#--- LICENSE ------------------------------------------------------------------
31
32# OleFileIO_PL is an improved version of the OleFileIO module from the
33# Python Imaging Library (PIL).
34
35# OleFileIO_PL changes are Copyright (c) 2005-2013 by Philippe Lagadec
36#
37# The Python Imaging Library (PIL) is
38#    Copyright (c) 1997-2005 by Secret Labs AB
39#    Copyright (c) 1995-2005 by Fredrik Lundh
40#
41# By obtaining, using, and/or copying this software and/or its associated
42# documentation, you agree that you have read, understood, and will comply with
43# the following terms and conditions:
44#
45# Permission to use, copy, modify, and distribute this software and its
46# associated documentation for any purpose and without fee is hereby granted,
47# provided that the above copyright notice appears in all copies, and that both
48# that copyright notice and this permission notice appear in supporting
49# documentation, and that the name of Secret Labs AB or the author(s) not be used
50# in advertising or publicity pertaining to distribution of the software
51# without specific, written prior permission.
52#
53# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
54# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
55# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL,
56# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
57# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
58# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
59# PERFORMANCE OF THIS SOFTWARE.
60
61#-----------------------------------------------------------------------------
62# CHANGELOG: (only OleFileIO_PL changes compared to PIL 1.1.6)
63# 2005-05-11 v0.10 PL: - a few fixes for Python 2.4 compatibility
64#                        (all changes flagged with [PL])
65# 2006-02-22 v0.11 PL: - a few fixes for some Office 2003 documents which raise
66#                        exceptions in _OleStream.__init__()
67# 2006-06-09 v0.12 PL: - fixes for files above 6.8MB (DIFAT in loadfat)
68#                      - added some constants
69#                      - added header values checks
70#                      - added some docstrings
71#                      - getsect: bugfix in case sectors >512 bytes
72#                      - getsect: added conformity checks
73#                      - DEBUG_MODE constant to activate debug display
74# 2007-09-04 v0.13 PL: - improved/translated (lots of) comments
75#                      - updated license
76#                      - converted tabs to 4 spaces
77# 2007-11-19 v0.14 PL: - added OleFileIO._raise_defect() to adapt sensitivity
78#                      - improved _unicode() to use Python 2.x unicode support
79#                      - fixed bug in _OleDirectoryEntry
80# 2007-11-25 v0.15 PL: - added safety checks to detect FAT loops
81#                      - fixed _OleStream which didn't check stream size
82#                      - added/improved many docstrings and comments
83#                      - moved helper functions _unicode and _clsid out of
84#                        OleFileIO class
85#                      - improved OleFileIO._find() to add Unix path syntax
86#                      - OleFileIO._find() is now case-insensitive
87#                      - added get_type() and get_rootentry_name()
88#                      - rewritten loaddirectory and _OleDirectoryEntry
89# 2007-11-27 v0.16 PL: - added _OleDirectoryEntry.kids_dict
90#                      - added detection of duplicate filenames in storages
91#                      - added detection of duplicate references to streams
92#                      - added get_size() and exists() to _OleDirectoryEntry
93#                      - added isOleFile to check header before parsing
94#                      - added __all__ list to control public keywords in pydoc
95# 2007-12-04 v0.17 PL: - added _load_direntry to fix a bug in loaddirectory
96#                      - improved _unicode(), added workarounds for Python <2.3
97#                      - added set_debug_mode and -d option to set debug mode
98#                      - fixed bugs in OleFileIO.open and _OleDirectoryEntry
99#                      - added safety check in main for large or binary
100#                        properties
101#                      - allow size>0 for storages for some implementations
102# 2007-12-05 v0.18 PL: - fixed several bugs in handling of FAT, MiniFAT and
103#                        streams
104#                      - added option '-c' in main to check all streams
105# 2009-12-10 v0.19 PL: - bugfix for 32 bit arrays on 64 bits platforms
106#                        (thanks to Ben G. and Martijn for reporting the bug)
107# 2009-12-11 v0.20 PL: - bugfix in OleFileIO.open when filename is not plain str
108# 2010-01-22 v0.21 PL: - added support for big-endian CPUs such as PowerPC Macs
109# 2012-02-16 v0.22 PL: - fixed bug in getproperties, patch by chuckleberryfinn
110#                        (https://bitbucket.org/decalage/olefileio_pl/issue/7)
111#                      - added close method to OleFileIO (fixed issue #2)
112# 2012-07-25 v0.23 PL: - added support for file-like objects (patch by mete0r_kr)
113# 2013-05-05 v0.24 PL: - getproperties: added conversion from filetime to python
114#                        datetime
115#                      - main: displays properties with date format
116#                      - new class OleMetadata to parse standard properties
117#                      - added get_metadata method
118# 2013-05-07 v0.24 PL: - a few improvements in OleMetadata
119# 2013-05-24 v0.25 PL: - getproperties: option to not convert some timestamps
120#                      - OleMetaData: total_edit_time is now a number of seconds,
121#                        not a timestamp
122#                      - getproperties: added support for VT_BOOL, VT_INT, VT_UINT
123#                      - getproperties: filter out null chars from strings
124#                      - getproperties: raise non-fatal defects instead of
125#                        exceptions when properties cannot be parsed properly
126# 2013-05-27       PL: - getproperties: improved exception handling
127#                      - _raise_defect: added option to set exception type
128#                      - all non-fatal issues are now recorded, and displayed
129#                        when run as a script
130# 2013-07-11 v0.26 PL: - added methods to get modification and creation times
131#                        of a directory entry or a storage/stream
132#                      - fixed parsing of direntry timestamps
133# 2013-07-24       PL: - new options in listdir to list storages and/or streams
134
135#-----------------------------------------------------------------------------
136# TODO (for version 1.0):
137# + add path attrib to _OleDirEntry, set it once and for all in init or
138#   append_kids (then listdir/_list can be simplified)
139# - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ...
140# - add underscore to each private method, to avoid their display in
141#   pydoc/epydoc documentation - Remove it for classes to be documented
142# - replace all raised exceptions with _raise_defect (at least in OleFileIO)
143# - merge code from _OleStream and OleFileIO.getsect to read sectors
144#   (maybe add a class for FAT and MiniFAT ?)
145# - add method to check all streams (follow sectors chains without storing all
146#   stream in memory, and report anomalies)
147# - use _OleDirectoryEntry.kids_dict to improve _find and _list ?
148# - fix Unicode names handling (find some way to stay compatible with Py1.5.2)
149#   => if possible avoid converting names to Latin-1
150# - review DIFAT code: fix handling of DIFSECT blocks in FAT (not stop)
151# - rewrite OleFileIO.getproperties
152# - improve docstrings to show more sample uses
153# - see also original notes and FIXME below
154# - remove all obsolete FIXMEs
155# - OleMetadata: fix version attrib according to
156#   http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
157
158# IDEAS:
159# - in OleFileIO._open and _OleStream, use size=None instead of 0x7FFFFFFF for
160#   streams with unknown size
161# - use arrays of int instead of long integers for FAT/MiniFAT, to improve
162#   performance and reduce memory usage ? (possible issue with values >2^31)
163# - provide tests with unittest (may need write support to create samples)
164# - move all debug code (and maybe dump methods) to a separate module, with
165#   a class which inherits OleFileIO ?
166# - fix docstrings to follow epydoc format
167# - add support for 4K sectors ?
168# - add support for big endian byte order ?
169# - create a simple OLE explorer with wxPython
170
171# FUTURE EVOLUTIONS to add write support:
172# 1) add ability to write a stream back on disk from StringIO (same size, no
173#    change in FAT/MiniFAT).
174# 2) rename a stream/storage if it doesn't change the RB tree
175# 3) use rbtree module to update the red-black tree + any rename
176# 4) remove a stream/storage: free sectors in FAT/MiniFAT
177# 5) allocate new sectors in FAT/MiniFAT
178# 6) create new storage/stream
179#-----------------------------------------------------------------------------
180
181#
182# THIS IS WORK IN PROGRESS
183#
184# The Python Imaging Library
185# $Id: OleFileIO.py 2339 2005-03-25 08:02:17Z fredrik $
186#
187# stuff to deal with OLE2 Structured Storage files.  this module is
188# used by PIL to read Image Composer and FlashPix files, but can also
189# be used to read other files of this type.
190#
191# History:
192# 1997-01-20 fl   Created
193# 1997-01-22 fl   Fixed 64-bit portability quirk
194# 2003-09-09 fl   Fixed typo in OleFileIO.loadfat (noted by Daniel Haertle)
195# 2004-02-29 fl   Changed long hex constants to signed integers
196#
197# Notes:
198# FIXME: sort out sign problem (eliminate long hex constants)
199# FIXME: change filename to use "a/b/c" instead of ["a", "b", "c"]
200# FIXME: provide a glob mechanism function (using fnmatchcase)
201#
202# Literature:
203#
204# "FlashPix Format Specification, Appendix A", Kodak and Microsoft,
205#  September 1996.
206#
207# Quotes:
208#
209# "If this document and functionality of the Software conflict,
210#  the actual functionality of the Software represents the correct
211#  functionality" -- Microsoft, in the OLE format specification
212#
213# Copyright (c) Secret Labs AB 1997.
214# Copyright (c) Fredrik Lundh 1997.
215#
216# See the README file for information on usage and redistribution.
217#
218
219#------------------------------------------------------------------------------
220
221import string, StringIO, struct, array, os.path, sys, datetime
222
#[PL] Define explicitly the public API to avoid private objects in pydoc.
# Only the names listed here are exported by "from <module> import *".
# The STGTY_* and DEFECT_* constants are appended to this list further down
# in the module, once they have been defined.
__all__ = ['OleFileIO', 'isOleFile']
225
#[PL] workaround to fix an issue with array item size on 64 bits systems:
# UINT32 is the array typecode whose items are exactly 4 bytes wide.
# 'L' is tried first (long integers are 32 bits on 32 bits platforms), then
# 'I' (plain integers are 32 bits on 64 bits platforms).
for _typecode in ('L', 'I'):
    if array.array(_typecode).itemsize == 4:
        UINT32 = _typecode
        break
else:
    # neither typecode is 4 bytes wide on this platform
    raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...')
# do not leave the probing variable behind in the module namespace
del _typecode
235
236
#[PL] These workarounds were inspired from the Path module
# (see http://www.jorendorff.com/articles/python/path/)
#TODO: test with old Python versions

# Pre-2.3 workaround for booleans:
# merely evaluating the two names raises NameError on an interpreter that
# does not define them; in that case they are bound to the integers 1 and 0.
try:
    True, False
except NameError:
    True, False = 1, 0

# Pre-2.3 workaround for basestring:
# if the name is missing, it is rebuilt as a tuple of the str and unicode
# types, or as plain str when unicode itself is not available.
try:
    basestring
except NameError:
    try:
        # is Unicode supported (Python >2.0 or >1.6 ?)
        basestring = (str, unicode)
    except NameError:
        basestring = str

#[PL] Experimental setting: if True, OLE filenames will be kept in Unicode
# if False (default PIL behaviour), all filenames are converted to Latin-1.
# This flag is read by _unicode() each time a name is decoded.
KEEP_UNICODE_NAMES = False
260
#[PL] DEBUG display mode: off by default. Call set_debug_mode() or pass "-d"
# on the command line to turn it on.
DEBUG_MODE = False


def debug_print(msg):
    """Display a debugging message on standard output."""
    print(msg)


def debug_pass(msg):
    """Silently drop a debugging message."""
    return None


# debug() is the hook called by the rest of the module: it is bound to
# debug_pass while debug mode is off, so messages are discarded.
debug = debug_pass
269
def set_debug_mode(debug_mode):
    """
    Turn the display of debugging messages on or off.

    debug_mode: True to print debugging messages, False to discard them.

    Updates the module-level DEBUG_MODE flag and rebinds the module-level
    debug() hook to debug_print or debug_pass accordingly.
    """
    global DEBUG_MODE, debug
    DEBUG_MODE = debug_mode
    # select the routine first, then publish it as the debug() hook
    if debug_mode:
        selected = debug_print
    else:
        selected = debug_pass
    debug = selected
281
# Signature found in the first 8 bytes of an OLE2 file (see isOleFile).
#TODO: convert this to hex
MAGIC = '\320\317\021\340\241\261\032\341'

#[PL]: added constants for Sector IDs (from AAF specifications)
# maximum SECT
MAXREGSECT = 0xFFFFFFFA
# (-4) denotes a DIFAT sector in a FAT
DIFSECT = 0xFFFFFFFC
# (-3) denotes a FAT sector in a FAT
FATSECT = 0xFFFFFFFD
# (-2) end of a virtual stream chain
ENDOFCHAIN = 0xFFFFFFFE
# (-1) unallocated sector
FREESECT = 0xFFFFFFFF

#[PL]: added constants for Directory Entry IDs (from AAF specifications)
# maximum directory entry ID
MAXREGSID = 0xFFFFFFFA
# (-1) unallocated directory entry
NOSTREAM = 0xFFFFFFFF

#[PL] object types in storage (from AAF specifications)
# empty directory entry (according to OpenOffice.org doc)
STGTY_EMPTY = 0
# element is a storage object
STGTY_STORAGE = 1
# element is a stream object
STGTY_STREAM = 2
# element is an ILockBytes object
STGTY_LOCKBYTES = 3
# element is an IPropertyStorage object
STGTY_PROPERTY = 4
# element is a root storage
STGTY_ROOT = 5
303
304
#
# --------------------------------------------------------------------
# property types (one numeric code per VT_* name)

VT_EMPTY = 0
VT_NULL = 1
VT_I2 = 2
VT_I4 = 3
VT_R4 = 4
VT_R8 = 5
VT_CY = 6
VT_DATE = 7
VT_BSTR = 8
VT_DISPATCH = 9
VT_ERROR = 10
VT_BOOL = 11
VT_VARIANT = 12
VT_UNKNOWN = 13
VT_DECIMAL = 14
VT_I1 = 16
VT_UI1 = 17
VT_UI2 = 18
VT_UI4 = 19
VT_I8 = 20
VT_UI8 = 21
VT_INT = 22
VT_UINT = 23
VT_VOID = 24
VT_HRESULT = 25
VT_PTR = 26
VT_SAFEARRAY = 27
VT_CARRAY = 28
VT_USERDEFINED = 29
VT_LPSTR = 30
VT_LPWSTR = 31
VT_FILETIME = 64
VT_BLOB = 65
VT_STREAM = 66
VT_STORAGE = 67
VT_STREAMED_OBJECT = 68
VT_STORED_OBJECT = 69
VT_BLOB_OBJECT = 70
VT_CF = 71
VT_CLSID = 72
VT_VECTOR = 0x1000

# reverse map: property type code -> "VT_xxx" name (for debugging purposes).
# It is built by scanning a snapshot of the module namespace for every name
# starting with "VT_".

VT = {}
for keyword, var in list(vars().items()):
    if keyword.startswith("VT_"):
        VT[var] = keyword
325
#
# --------------------------------------------------------------------
# Some common document types (root.clsid fields)

WORD_CLSID = "00020900-0000-0000-C000-000000000046"
#TODO: check Excel, PPT, ...

#[PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect()
# a case which looks weird, but not sure it's a defect
DEFECT_UNSURE = 10
# a potential defect
DEFECT_POTENTIAL = 20
# an error according to specifications, but parsing can go on
DEFECT_INCORRECT = 30
# an error which cannot be ignored, parsing is impossible
DEFECT_FATAL = 40

#[PL] add useful constants to __all__:
# every module-level name starting with STGTY_ or DEFECT_ becomes public
# (a snapshot of the namespace is scanned).
for key in list(vars().keys()):
    if key[:6] == 'STGTY_' or key[:7] == 'DEFECT_':
        __all__.append(key)
345
346
347#--- FUNCTIONS ----------------------------------------------------------------
348
def isOleFile(filename):
    """
    Test if file is an OLE container (according to its header).

    The first len(MAGIC) bytes of the file are read and compared with the
    MAGIC signature. The file is always closed before returning, even if
    reading fails.

    filename: file name or path (str, unicode)
    return: True if OLE, False otherwise.
    raise: whatever open() or read() raises if the file cannot be read.
    """
    f = open(filename, 'rb')
    # try/finally (rather than "with") keeps the module usable on the old
    # Python 2 versions it targets, while making sure the handle is released.
    try:
        header = f.read(len(MAGIC))
    finally:
        f.close()
    return header == MAGIC
361
362
363#TODO: replace i16 and i32 with more readable struct.unpack equivalent
def i16(c, o = 0):
    """
    Decode a 2-bytes (16 bits) little-endian string into an integer.

    c: string containing bytes to convert
    o: offset of bytes to convert in string
    return: integer value, the byte at offset o being the least significant
    """
    low = ord(c[o])
    high = ord(c[o+1])
    return low + (high << 8)
372
373
def i32(c, o = 0):
    """
    Decode a 4-bytes (32 bits) little-endian string into an integer.

    c: string containing bytes to convert
    o: offset of bytes to convert in string
    return: integer value, the byte at offset o being the least significant
    """
    total = 0
    for position in range(4):
        # byte number "position" weighs 256**position
        total = total + (ord(c[o + position]) << (8 * position))
    # [PL]: int() is applied because "<<" gives long int since Python 2.4
    return int(total)
383
384
385def _clsid(clsid):
386    """
387    Converts a CLSID to a human-readable string.
388    clsid: string of length 16.
389    """
390    assert len(clsid) == 16
391    if clsid == "\0" * len(clsid):
392        return ""
393    return (("%08X-%04X-%04X-%02X%02X-" + "%02X" * 6) %
394            ((i32(clsid, 0), i16(clsid, 4), i16(clsid, 6)) +
395            tuple(map(ord, clsid[8:16]))))
396
397
398
399# UNICODE support for Old Python versions:
400# (necessary to handle storages/streams names which use Unicode)
401
402try:
403    # is Unicode supported ?
404    unicode
405
406    def _unicode(s, errors='replace'):
407        """
408        Map unicode string to Latin 1. (Python with Unicode support)
409
410        s: UTF-16LE unicode string to convert to Latin-1
411        errors: 'replace', 'ignore' or 'strict'. See Python doc for unicode()
412        """
413        #TODO: test if it OleFileIO works with Unicode strings, instead of
414        #      converting to Latin-1.
415        try:
416            # First the string is converted to plain Unicode:
417            # (assuming it is encoded as UTF-16 little-endian)
418            u = s.decode('UTF-16LE', errors)
419            if KEEP_UNICODE_NAMES:
420                return u
421            else:
422                # Second the unicode string is converted to Latin-1
423                return u.encode('latin_1', errors)
424        except:
425            # there was an error during Unicode to Latin-1 conversion:
426            raise IOError ('incorrect Unicode name')
427
428except NameError:
429    def _unicode(s, errors='replace'):
430        """
431        Map unicode string to Latin 1. (Python without native Unicode support)
432
433        s: UTF-16LE unicode string to convert to Latin-1
434        errors: 'replace', 'ignore' or 'strict'. (ignored in this version)
435        """
436        # If the unicode function does not exist, we assume this is an old
437        # Python version without Unicode support.
438        # Null bytes are simply removed (this only works with usual Latin-1
439        # strings which do not contain unicode characters>256):
440        return filter(ord, s)
441
442
def filetime2datetime(filetime):
    """
    convert FILETIME (64 bits int) to Python datetime.datetime

    filetime: number of 100-nanosecond intervals elapsed since
              1601-01-01 00:00:00
    return: naive datetime.datetime object; precision is one microsecond,
            the remaining tenths of microsecond are truncated.
    """
    # TODO: manage exception when microseconds is too large
    # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/
    _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
    #debug('timedelta days=%d' % (filetime/(10*1000000*3600*24)))
    # Explicit floor division: the result stays an exact integer whatever the
    # division mode is. With true division the value would go through a
    # float, which cannot hold every 64 bits timestamp exactly.
    return _FILETIME_null_date + datetime.timedelta(microseconds=filetime // 10)
452
453
454
455#=== CLASSES ==================================================================
456
class OleMetadata:
    """
    Container for the metadata found in the standard property streams of an
    OLE file (SummaryInformation and DocumentSummaryInformation).

    Available attributes:
    codepage, title, subject, author, keywords, comments, template,
    last_saved_by, revision_number, total_edit_time, last_printed, create_time,
    last_saved_time, num_pages, num_words, num_chars, thumbnail,
    creating_application, security, codepage_doc, category, presentation_target,
    bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips,
    scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty,
    chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed,
    version, dig_sig, content_type, content_status, language, doc_version

    Note: an attribute is set to None when not present in the properties of the
    OLE file.

    References for SummaryInformation stream:
    - http://msdn.microsoft.com/en-us/library/dd942545.aspx
    - http://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx
    - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx
    - http://msdn.microsoft.com/en-us/library/aa372045.aspx
    - http://sedna-soft.de/summary-information-stream/
    - http://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html

    References for DocumentSummaryInformation stream:
    - http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
    - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx
    - http://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html

    new in version 0.25
    """

    # Attribute names for the SummaryInformation stream. The position in the
    # list gives the property id: item 0 is property 1, item 1 is property 2,
    # and so on up to property 19 (0x13).
    SUMMARY_ATTRIBS = [
        'codepage', 'title', 'subject', 'author', 'keywords', 'comments',
        'template', 'last_saved_by', 'revision_number', 'total_edit_time',
        'last_printed', 'create_time', 'last_saved_time', 'num_pages',
        'num_words', 'num_chars', 'thumbnail', 'creating_application',
        'security',
    ]

    # Attribute names for the DocumentSummaryInformation stream, ordered the
    # same way: item 0 is property 1, up to property 28 (0x1C).
    DOCSUM_ATTRIBS = [
        'codepage_doc', 'category', 'presentation_target', 'bytes', 'lines',
        'paragraphs', 'slides', 'notes', 'hidden_slides', 'mm_clips',
        'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager',
        'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc',
        'link_base', 'hlinks', 'hlinks_changed', 'version', 'dig_sig',
        'content_type', 'content_status', 'language', 'doc_version',
    ]

    def __init__(self):
        """
        Constructor for OleMetadata
        Every attribute named in SUMMARY_ATTRIBS and DOCSUM_ATTRIBS is created
        and set to None.
        """
        for name in self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS:
            setattr(self, name, None)

    def parse_properties(self, olefile):
        """
        Read the standard properties of an OLE file into this object.

        olefile: object providing exists() and getproperties(), used to look
                 up the streams named SummaryInformation and
                 DocumentSummaryInformation (both prefixed with character 0x05)

        All attributes are first reset to None. Then, for each of the two
        streams which exists, each attribute receives the value of the
        property with the matching id, or None if that property is absent.
        """
        # start from a clean state:
        for name in self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS:
            setattr(self, name, None)

        summary_stream = "\x05SummaryInformation"
        if olefile.exists(summary_stream):
            # timestamps are converted to python datetime, except property
            # #10 (total_edit_time) which is explicitly excluded:
            found = olefile.getproperties(summary_stream,
                convert_time=True, no_conversion=[10])
            for index, name in enumerate(self.SUMMARY_ATTRIBS):
                # property ids start at 1, list indexes at 0
                setattr(self, name, found.get(index + 1))

        docsum_stream = "\x05DocumentSummaryInformation"
        if olefile.exists(docsum_stream):
            found = olefile.getproperties(docsum_stream,
                convert_time=True)
            for index, name in enumerate(self.DOCSUM_ATTRIBS):
                # property ids start at 1, list indexes at 0
                setattr(self, name, found.get(index + 1))

    def dump(self):
        """
        Print every attribute with its value, for debugging purposes.
        """
        sections = (
            ('Properties from SummaryInformation stream:', self.SUMMARY_ATTRIBS),
            ('Properties from DocumentSummaryInformation stream:', self.DOCSUM_ATTRIBS),
        )
        for heading, names in sections:
            print(heading)
            for name in names:
                print('- %s: %s' % (name, repr(getattr(self, name))))
607
608
609#--- _OleStream ---------------------------------------------------------------
610
class _OleStream(StringIO.StringIO):
    """
    OLE2 Stream

    Returns a read-only file object which can be used to read
    the contents of a OLE stream (instance of the StringIO class).
    To open a stream, use the openstream method in the OleFile class.

    This function can be used with either ordinary streams,
    or ministreams, depending on the offset, sectorsize, and
    fat table arguments.

    Attributes:
        - size: actual size of data stream, after it was opened.
    """

    # FIXME: should store the list of sects obtained by following
    # the fat chain, and load new sectors on demand instead of
    # loading it all in one go.

    def __init__(self, fp, sect, size, offset, sectorsize, fat, filesize):
        """
        Constructor for _OleStream class.

        The whole stream is read in memory by following the chain of sectors
        in the given FAT, starting at sect.

        fp        : file object, the OLE container or the MiniFAT stream
        sect      : sector index of first sector in the stream
        size      : total size of the stream; 0x7FFFFFFF means that the size
                    is not known in advance
        offset    : offset in bytes for the first FAT or MiniFAT sector
        sectorsize: size of one sector
        fat       : array/list of sector indexes (FAT or MiniFAT)
        filesize  : size of OLE file (only used in debugging messages)
        return    : a StringIO instance containing the OLE stream
        raise     : IOError if the stream is malformed (chain longer than the
                    FAT, sector index out of range, incomplete sector or
                    stream, missing end of chain marker)
        """
        debug('_OleStream.__init__:')
        debug('  sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s'
            %(sect,sect,size,offset,sectorsize,len(fat), repr(fp)))
        #[PL] To detect malformed documents with FAT loops, we compute the
        # expected number of sectors in the stream:
        unknown_size = False
        if size==0x7FFFFFFF:
            # this is the case when called from OleFileIO._open(), and stream
            # size is not known in advance (for example when reading the
            # Directory stream). Then we can only guess maximum size:
            size = len(fat)*sectorsize
            # and we keep a record that size was unknown:
            unknown_size = True
            debug('  stream with UNKNOWN SIZE')
        # number of sectors needed to hold size bytes (rounded up):
        nb_sectors = (size + (sectorsize-1)) // sectorsize
        debug('nb_sectors = %d' % nb_sectors)
        # This number should (at least) be less than the total number of
        # sectors in the given FAT:
        if nb_sectors > len(fat):
            raise IOError ('malformed OLE document, stream too large')
        # optimization(?): data is first a list of strings, and join() is called
        # at the end to concatenate all in one string.
        # (this may not be really useful with recent Python versions)
        data = []
        # if size is zero, then first sector index should be ENDOFCHAIN:
        if size == 0 and sect != ENDOFCHAIN:
            debug('size == 0 and sect != ENDOFCHAIN:')
            raise IOError ('incorrect OLE sector index for empty stream')
        #[PL] A fixed-length for loop is used instead of an undefined while
        # loop to avoid DoS attacks:
        for i in xrange(nb_sectors):
            # Sector index may be ENDOFCHAIN, but only if size was unknown
            if sect == ENDOFCHAIN:
                if unknown_size:
                    break
                else:
                    # else this means that the stream is smaller than declared:
                    debug('sect=ENDOFCHAIN before expected size')
                    raise IOError ('incomplete OLE stream')
            # sector index should be within FAT:
            if sect<0 or sect>=len(fat):
                debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat)))
                debug('i=%d / nb_sectors=%d' %(i, nb_sectors))
                raise IOError('incorrect OLE FAT, sector index out of range')
            #TODO: merge this code with OleFileIO.getsect() ?
            #TODO: check if this works with 4K sectors:
            try:
                fp.seek(offset + sectorsize * sect)
            except:
                # NOTE(review): this bare except turns any failure of seek()
                # into IOError; it is kept as is to preserve behaviour.
                debug('sect=%d, seek=%d, filesize=%d' %
                    (sect, offset+sectorsize*sect, filesize))
                raise IOError('OLE sector index out of range')
            sector_data = fp.read(sectorsize)
            # [PL] check if there was enough data:
            # Note: if sector is the last of the file, sometimes it is not a
            # complete sector (of 512 or 4K), so we may read less than
            # sectorsize.
            if len(sector_data)!=sectorsize and sect!=(len(fat)-1):
                debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' %
                    (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data)))
                debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data)))
                raise IOError ('incomplete OLE sector')
            data.append(sector_data)
            # jump to next sector in the FAT:
            try:
                sect = fat[sect]
            except IndexError:
                # [PL] if pointer is out of the FAT an exception is raised
                raise IOError('incorrect OLE FAT, sector index out of range')
        #[PL] Last sector should be a "end of chain" marker:
        if sect != ENDOFCHAIN:
            raise IOError ('incorrect last sector index in OLE stream')
        data = "".join(data)
        # Data is truncated to the actual stream size:
        if len(data) >= size:
            data = data[:size]
            # actual stream size is stored for future use:
            self.size = size
        elif unknown_size:
            # actual stream size was not known, now we know the size of read
            # data:
            self.size = len(data)
        else:
            # read data is less than expected:
            debug('len(data)=%d, size=%d' % (len(data), size))
            raise IOError ('OLE stream size is less than declared')
        # when all data is read in memory, StringIO constructor is called
        StringIO.StringIO.__init__(self, data)
737        # Then the _OleStream object can be used as a read-only file object.
738
739
740#--- _OleDirectoryEntry -------------------------------------------------------
741
class _OleDirectoryEntry:

    """
    OLE2 Directory Entry

    Holds the fields decoded from one 128-byte record of the OLE directory
    stream, plus the list of child entries when the entry is a storage
    (see build_storage_tree).
    """
    #[PL] parsing code moved from OleFileIO.loaddirectory

    # struct to parse directory entries:
    # <: little-endian byte order, standard sizes
    #    (note: this should guarantee that Q returns a 64 bits int)
    # 64s: string containing entry name in unicode (max 31 chars) + null char
    # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
    # B: uint8, dir entry type (between 0 and 5)
    # B: uint8, color: 0=black, 1=red
    # I: uint32, index of left child node in the red-black tree, NOSTREAM if none
    # I: uint32, index of right child node in the red-black tree, NOSTREAM if none
    # I: uint32, index of child root node if it is a storage, else NOSTREAM
    # 16s: CLSID, unique identifier (only used if it is a storage)
    # I: uint32, user flags
    # Q (was 8s): uint64, creation timestamp or zero
    # Q (was 8s): uint64, modification timestamp or zero
    # I: uint32, SID of first sector if stream or ministream, SID of 1st sector
    #    of stream containing ministreams if root entry, 0 otherwise
    # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise
    # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise
    STRUCT_DIRENTRY = '<64sHBBIII16sIQQIII'
    # size of a directory entry: 128 bytes
    DIRENTRY_SIZE = 128
    assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE


    def __init__(self, entry, sid, olefile):
        """
        Constructor for an _OleDirectoryEntry object.
        Parses a 128-bytes entry from the OLE Directory stream.

        entry  : string (must be 128 bytes long)
        sid    : index of this directory entry in the OLE file directory
        olefile: OleFileIO containing this directory entry

        Inconsistencies found in the entry are reported through
        olefile._raise_defect(), which may raise an exception depending on
        the defect level chosen for the OleFileIO object.
        """
        self.sid = sid
        # ref to olefile is stored for future use
        self.olefile = olefile
        # kids is a list of children entries, if this entry is a storage:
        # (list of _OleDirectoryEntry objects)
        self.kids = []
        # kids_dict is a dictionary of children entries, indexed by their
        # name in lowercase: used to quickly find an entry, and to detect
        # duplicates
        self.kids_dict = {}
        # flag used to detect if the entry is referenced more than once in
        # directory:
        self.used = False
        # decode DirEntry
        (
            name,
            namelength,
            self.entry_type,
            self.color,
            self.sid_left,
            self.sid_right,
            self.sid_child,
            clsid,
            self.dwUserFlags,
            self.createTime,
            self.modifyTime,
            self.isectStart,
            sizeLow,
            sizeHigh
        ) = struct.unpack(_OleDirectoryEntry.STRUCT_DIRENTRY, entry)
        if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]:
            olefile._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type')
        # only first directory entry can (and should) be root:
        if self.entry_type == STGTY_ROOT and sid != 0:
            olefile._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry')
        if sid == 0 and self.entry_type != STGTY_ROOT:
            olefile._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry')
        # name should be at most 31 unicode characters + null character,
        # so 64 bytes in total (31*2 + 2):
        if namelength>64:
            olefile._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length')
            # if exception not raised, namelength is set to the maximum value:
            namelength = 64
        # only characters without ending null char are kept:
        name = name[:(namelength-2)]
        # name is converted from unicode to Latin-1:
        self.name = _unicode(name)

        debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name)))
        debug(' - type: %d' % self.entry_type)
        debug(' - sect: %d' % self.isectStart)
        debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left,
            self.sid_right, self.sid_child))

        # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes
        # sectors, BUT apparently some implementations set it as 0xFFFFFFFF, 1
        # or some other value so it cannot be raised as a defect in general:
        if olefile.sectorsize == 512:
            if sizeHigh != 0 and sizeHigh != 0xFFFFFFFF:
                debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' %
                    (olefile.sectorsize, sizeLow, sizeHigh, sizeHigh))
                olefile._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size')
            self.size = sizeLow
        else:
            # integers are promoted automatically, no explicit long() needed
            self.size = sizeLow + (sizeHigh << 32)
        debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, sizeLow, sizeHigh))

        self.clsid = _clsid(clsid)
        # a storage should have a null size, BUT some implementations such as
        # Word 8 for Mac seem to allow non-null values => Potential defect:
        if self.entry_type == STGTY_STORAGE and self.size != 0:
            olefile._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0')
        # check if stream is not already referenced elsewhere:
        if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0:
            # only streams can be in MiniFAT (the root entry never is):
            minifat = (self.size < olefile.minisectorcutoff
                       and self.entry_type == STGTY_STREAM)
            olefile._check_duplicate_stream(self.isectStart, minifat)



    def build_storage_tree(self):
        """
        Read and build the red-black tree attached to this _OleDirectoryEntry
        object, if it is a storage.
        Note that this method builds a tree of all subentries, so it should
        only be called for the root object once.
        """
        debug('build_storage_tree: SID=%d - %s - sid_child=%d'
            % (self.sid, repr(self.name), self.sid_child))
        if self.sid_child != NOSTREAM:
            # if child SID is not NOSTREAM, then this entry is a storage.
            # Let's walk through the tree of children to fill the kids list:
            self.append_kids(self.sid_child)

            # Note from OpenOffice documentation: the safest way is to
            # recreate the tree because some implementations may store broken
            # red-black trees...

            # in the OLE file, entries are sorted on (length, name).
            # for convenience, we sort them on name instead (same order as
            # the __cmp__ method of this class, without depending on it):
            self.kids.sort(key=lambda kid: kid.name)


    def append_kids(self, child_sid):
        """
        Walk through red-black tree of children of this directory entry to add
        all of them to the kids list. (recursive method)

        child_sid : index of child directory entry to use, or NOSTREAM for a
                    leaf of the tree (nothing is done in that case).

        An entry which was already attached to a storage is reported as a
        defect and NOT walked again: this is what stops the recursion when a
        malformed file contains a loop in its tree.
        """
        #[PL] this method was added to use simple recursion instead of a complex
        # algorithm.
        # if this is not a storage or a leaf of the tree, nothing to do:
        if child_sid == NOSTREAM:
            return
        # check if child SID is in the proper range:
        if child_sid<0 or child_sid>=len(self.olefile.direntries):
            self.olefile._raise_defect(DEFECT_FATAL, 'OLE DirEntry index out of range')
            # if exception not raised, there is no entry to load:
            return
        # get child direntry:
        child = self.olefile._load_direntry(child_sid) #direntries[child_sid]
        debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d'
            % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child))
        # Check if kid was not already referenced in a storage. This must be
        # done BEFORE walking its subtrees, otherwise a loop in the tree
        # would recurse forever:
        if child.used:
            self.olefile._raise_defect(DEFECT_INCORRECT,
                'OLE Entry referenced more than once')
            return
        child.used = True
        # the directory entries are organized as a red-black tree.
        # (cf. Wikipedia for details)
        # First walk through left side of the tree:
        self.append_kids(child.sid_left)
        # Check if its name is not already used (case-insensitive):
        name_lower = child.name.lower()
        if name_lower in self.kids_dict:
            self.olefile._raise_defect(DEFECT_INCORRECT,
                "Duplicate filename in OLE storage")
        # Then the child_sid _OleDirectoryEntry object is appended to the
        # kids list and dictionary:
        self.kids.append(child)
        self.kids_dict[name_lower] = child
        # Finally walk through right side of the tree:
        self.append_kids(child.sid_right)
        # Afterwards build kid's own tree if it's also a storage:
        child.build_storage_tree()


    def __cmp__(self, other):
        "Compare entries by name"
        return cmp(self.name, other.name)
        #TODO: replace by the same function as MS implementation ?
        # (order by name length first, then case-insensitive order)


    def dump(self, tab = 0):
        "Dump this entry, and all its subentries (for debug purposes only)"
        TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)",
                 "(property)", "(root)"]
        # an unhandled entry type may get here when defects are not raised:
        if 0 <= self.entry_type < len(TYPES):
            type_name = TYPES[self.entry_type]
        else:
            type_name = "(unknown)"
        # the line is built first and printed in one call, so that the same
        # code runs with the print statement and with the print function:
        line = " "*tab + repr(self.name) + " " + type_name
        if self.entry_type in (STGTY_STREAM, STGTY_ROOT):
            line += " %s bytes" % self.size
        print(line)
        if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid:
            print(" "*tab + "{%s}" % self.clsid)

        for kid in self.kids:
            kid.dump(tab + 2)


    def getmtime(self):
        """
        Return modification time of a directory entry.

        return: None if modification time is null, a python datetime object
        otherwise (UTC timezone)

        new in version 0.26
        """
        if self.modifyTime == 0:
            return None
        return filetime2datetime(self.modifyTime)


    def getctime(self):
        """
        Return creation time of a directory entry.

        return: None if creation time is null, a python datetime object
        otherwise (UTC timezone)

        new in version 0.26
        """
        if self.createTime == 0:
            return None
        return filetime2datetime(self.createTime)
983
984
985#--- OleFileIO ----------------------------------------------------------------
986
987class OleFileIO:
988    """
989    OLE container object
990
991    This class encapsulates the interface to an OLE 2 structured
992    storage file.  Use the {@link listdir} and {@link openstream} methods to
993    access the contents of this file.
994
995    Object names are given as a list of strings, one for each subentry
996    level.  The root entry should be omitted.  For example, the following
997    code extracts all image streams from a Microsoft Image Composer file:
998
999        ole = OleFileIO("fan.mic")
1000
1001        for entry in ole.listdir():
1002            if entry[1:2] == "Image":
1003                fin = ole.openstream(entry)
1004                fout = open(entry[0:1], "wb")
1005                while 1:
1006                    s = fin.read(8192)
1007                    if not s:
1008                        break
1009                    fout.write(s)
1010
1011    You can use the viewer application provided with the Python Imaging
1012    Library to view the resulting files (which happens to be standard
1013    TIFF files).
1014    """
1015
1016    def __init__(self, filename = None, raise_defects=DEFECT_FATAL):
1017        """
1018        Constructor for OleFileIO class.
1019
1020        filename: file to open.
1021        raise_defects: minimal level for defects to be raised as exceptions.
1022        (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a
1023        security-oriented application, see source code for details)
1024        """
1025        # minimal level for defects to be raised as exceptions:
1026        self._raise_defects_level = raise_defects
1027        # list of defects/issues not raised as exceptions:
1028        # tuples of (exception type, message)
1029        self.parsing_issues = []
1030        if filename:
1031            self.open(filename)
1032
1033
1034    def _raise_defect(self, defect_level, message, exception_type=IOError):
1035        """
1036        This method should be called for any defect found during file parsing.
1037        It may raise an IOError exception according to the minimal level chosen
1038        for the OleFileIO object.
1039
1040        defect_level: defect level, possible values are:
1041            DEFECT_UNSURE    : a case which looks weird, but not sure it's a defect
1042            DEFECT_POTENTIAL : a potential defect
1043            DEFECT_INCORRECT : an error according to specifications, but parsing can go on
1044            DEFECT_FATAL     : an error which cannot be ignored, parsing is impossible
1045        message: string describing the defect, used with raised exception.
1046        exception_type: exception class to be raised, IOError by default
1047        """
1048        # added by [PL]
1049        if defect_level >= self._raise_defects_level:
1050            raise exception_type, message
1051        else:
1052            # just record the issue, no exception raised:
1053            self.parsing_issues.append((exception_type, message))
1054
1055
1056    def open(self, filename):
1057        """
1058        Open an OLE2 file.
1059        Reads the header, FAT and directory.
1060
1061        filename: string-like or file-like object
1062        """
1063        #[PL] check if filename is a string-like or file-like object:
1064        # (it is better to check for a read() method)
1065        if hasattr(filename, 'read'):
1066            # file-like object
1067            self.fp = filename
1068        else:
1069            # string-like object: filename of file on disk
1070            #TODO: if larger than 1024 bytes, this could be the actual data => StringIO
1071            self.fp = open(filename, "rb")
1072        # old code fails if filename is not a plain string:
1073        #if type(filename) == type(""):
1074        #    self.fp = open(filename, "rb")
1075        #else:
1076        #    self.fp = filename
1077        # obtain the filesize by using seek and tell, which should work on most
1078        # file-like objects:
1079        #TODO: do it above, using getsize with filename when possible?
1080        #TODO: fix code to fail with clear exception when filesize cannot be obtained
1081        self.fp.seek(0, os.SEEK_END)
1082        try:
1083            filesize = self.fp.tell()
1084        finally:
1085            self.fp.seek(0)
1086        self._filesize = filesize
1087
1088        # lists of streams in FAT and MiniFAT, to detect duplicate references
1089        # (list of indexes of first sectors of each stream)
1090        self._used_streams_fat = []
1091        self._used_streams_minifat = []
1092
1093        header = self.fp.read(512)
1094
1095        if len(header) != 512 or header[:8] != MAGIC:
1096            self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file")
1097
1098        # [PL] header structure according to AAF specifications:
1099        ##Header
1100        ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)]
1101        ##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
1102        ##                // 0x1a, 0xe1} for current version
1103        ##CLSID _clsid;   // [08H,16] reserved must be zero (WriteClassStg/
1104        ##                // GetClassFile uses root directory class id)
1105        ##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is
1106        ##                       // written by reference implementation
1107        ##USHORT _uDllVersion;   // [1AH,02] major version of the dll/format: 3 for
1108        ##                       // 512-byte sectors, 4 for 4 KB sectors
1109        ##USHORT _uByteOrder;    // [1CH,02] 0xFFFE: indicates Intel byte-ordering
1110        ##USHORT _uSectorShift;  // [1EH,02] size of sectors in power-of-two;
1111        ##                       // typically 9 indicating 512-byte sectors
1112        ##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two;
1113        ##                          // typically 6 indicating 64-byte mini-sectors
1114        ##USHORT _usReserved; // [22H,02] reserved, must be zero
1115        ##ULONG _ulReserved1; // [24H,04] reserved, must be zero
1116        ##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors,
1117        ##                   // number of SECTs in directory chain for 4 KB
1118        ##                   // sectors
1119        ##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain
1120        ##SECT _sectDirStart; // [30H,04] first SECT in the directory chain
1121        ##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must
1122        ##                        // be zero. The reference implementation
1123        ##                        // does not support transactions
1124        ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream;
1125        ##                           // typically 4096 bytes
1126        ##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
1127        ##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain
1128        ##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain
1129        ##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain
1130        ##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors
1131        ##};
1132
1133        # [PL] header decoding:
1134        # '<' indicates little-endian byte ordering for Intel (cf. struct module help)
1135        fmt_header = '<8s16sHHHHHHLLLLLLLLLL'
1136        header_size = struct.calcsize(fmt_header)
1137        debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) )
1138        header1 = header[:header_size]
1139        (
1140            self.Sig,
1141            self.clsid,
1142            self.MinorVersion,
1143            self.DllVersion,
1144            self.ByteOrder,
1145            self.SectorShift,
1146            self.MiniSectorShift,
1147            self.Reserved, self.Reserved1,
1148            self.csectDir,
1149            self.csectFat,
1150            self.sectDirStart,
1151            self.signature,
1152            self.MiniSectorCutoff,
1153            self.MiniFatStart,
1154            self.csectMiniFat,
1155            self.sectDifStart,
1156            self.csectDif
1157        ) = struct.unpack(fmt_header, header1)
1158        debug( struct.unpack(fmt_header,    header1))
1159
1160        if self.Sig != '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1':
1161            # OLE signature should always be present
1162            self._raise_defect(DEFECT_FATAL, "incorrect OLE signature")
1163        if self.clsid != '\x00'*16:
1164            # according to AAF specs, CLSID should always be zero
1165            self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header")
1166        debug( "MinorVersion = %d" % self.MinorVersion )
1167        debug( "DllVersion   = %d" % self.DllVersion )
1168        if self.DllVersion not in [3, 4]:
1169            # version 3: usual format, 512 bytes per sector
1170            # version 4: large format, 4K per sector
1171            self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header")
1172        debug( "ByteOrder    = %X" % self.ByteOrder )
1173        if self.ByteOrder != 0xFFFE:
1174            # For now only common little-endian documents are handled correctly
1175            self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header")
1176            # TODO: add big-endian support for documents created on Mac ?
1177        self.SectorSize = 2**self.SectorShift
1178        debug( "SectorSize   = %d" % self.SectorSize )
1179        if self.SectorSize not in [512, 4096]:
1180            self._raise_defect(DEFECT_INCORRECT, "incorrect SectorSize in OLE header")
1181        if (self.DllVersion==3 and self.SectorSize!=512) \
1182        or (self.DllVersion==4 and self.SectorSize!=4096):
1183            self._raise_defect(DEFECT_INCORRECT, "SectorSize does not match DllVersion in OLE header")
1184        self.MiniSectorSize = 2**self.MiniSectorShift
1185        debug( "MiniSectorSize   = %d" % self.MiniSectorSize )
1186        if self.MiniSectorSize not in [64]:
1187            self._raise_defect(DEFECT_INCORRECT, "incorrect MiniSectorSize in OLE header")
1188        if self.Reserved != 0 or self.Reserved1 != 0:
1189            self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)")
1190        debug( "csectDir     = %d" % self.csectDir )
1191        if self.SectorSize==512 and self.csectDir!=0:
1192            self._raise_defect(DEFECT_INCORRECT, "incorrect csectDir in OLE header")
1193        debug( "csectFat     = %d" % self.csectFat )
1194        debug( "sectDirStart = %X" % self.sectDirStart )
1195        debug( "signature    = %d" % self.signature )
1196        # Signature should be zero, BUT some implementations do not follow this
1197        # rule => only a potential defect:
1198        if self.signature != 0:
1199            self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (signature>0)")
1200        debug( "MiniSectorCutoff = %d" % self.MiniSectorCutoff )
1201        debug( "MiniFatStart     = %X" % self.MiniFatStart )
1202        debug( "csectMiniFat     = %d" % self.csectMiniFat )
1203        debug( "sectDifStart     = %X" % self.sectDifStart )
1204        debug( "csectDif         = %d" % self.csectDif )
1205
1206        # calculate the number of sectors in the file
1207        # (-1 because header doesn't count)
1208        self.nb_sect = ( (filesize + self.SectorSize-1) / self.SectorSize) - 1
1209        debug( "Number of sectors in the file: %d" % self.nb_sect )
1210
1211        # file clsid (probably never used, so we don't store it)
1212        clsid = _clsid(header[8:24])
1213        self.sectorsize = self.SectorSize #1 << i16(header, 30)
1214        self.minisectorsize = self.MiniSectorSize  #1 << i16(header, 32)
1215        self.minisectorcutoff = self.MiniSectorCutoff # i32(header, 56)
1216
1217        # check known streams for duplicate references (these are always in FAT,
1218        # never in MiniFAT):
1219        self._check_duplicate_stream(self.sectDirStart)
1220        # check MiniFAT only if it is not empty:
1221        if self.csectMiniFat:
1222            self._check_duplicate_stream(self.MiniFatStart)
1223        # check DIFAT only if it is not empty:
1224        if self.csectDif:
1225            self._check_duplicate_stream(self.sectDifStart)
1226
1227        # Load file allocation tables
1228        self.loadfat(header)
1229        # Load direcory.  This sets both the direntries list (ordered by sid)
1230        # and the root (ordered by hierarchy) members.
1231        self.loaddirectory(self.sectDirStart)#i32(header, 48))
1232        self.ministream = None
1233        self.minifatsect = self.MiniFatStart #i32(header, 60)
1234
1235
1236    def close(self):
1237        """
1238        close the OLE file, to release the file object
1239        """
1240        self.fp.close()
1241
1242
1243    def _check_duplicate_stream(self, first_sect, minifat=False):
1244        """
1245        Checks if a stream has not been already referenced elsewhere.
1246        This method should only be called once for each known stream, and only
1247        if stream size is not null.
1248        first_sect: index of first sector of the stream in FAT
1249        minifat: if True, stream is located in the MiniFAT, else in the FAT
1250        """
1251        if minifat:
1252            debug('_check_duplicate_stream: sect=%d in MiniFAT' % first_sect)
1253            used_streams = self._used_streams_minifat
1254        else:
1255            debug('_check_duplicate_stream: sect=%d in FAT' % first_sect)
1256            # some values can be safely ignored (not a real stream):
1257            if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT):
1258                return
1259            used_streams = self._used_streams_fat
1260        #TODO: would it be more efficient using a dict or hash values, instead
1261        #      of a list of long ?
1262        if first_sect in used_streams:
1263            self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice')
1264        else:
1265            used_streams.append(first_sect)
1266
1267
1268    def dumpfat(self, fat, firstindex=0):
1269        "Displays a part of FAT in human-readable form for debugging purpose"
1270        # [PL] added only for debug
1271        if not DEBUG_MODE:
1272            return
1273        # dictionary to convert special FAT values in human-readable strings
1274        VPL=8 # valeurs par ligne (8+1 * 8+1 = 81)
1275        fatnames = {
1276            FREESECT:   "..free..",
1277            ENDOFCHAIN: "[ END. ]",
1278            FATSECT:    "FATSECT ",
1279            DIFSECT:    "DIFSECT "
1280            }
1281        nbsect = len(fat)
1282        nlines = (nbsect+VPL-1)/VPL
1283        print "index",
1284        for i in range(VPL):
1285            print ("%8X" % i),
1286        print ""
1287        for l in range(nlines):
1288            index = l*VPL
1289            print ("%8X:" % (firstindex+index)),
1290            for i in range(index, index+VPL):
1291                if i>=nbsect:
1292                    break
1293                sect = fat[i]
1294                if sect in fatnames:
1295                    nom = fatnames[sect]
1296                else:
1297                    if sect == i+1:
1298                        nom = "    --->"
1299                    else:
1300                        nom = "%8X" % sect
1301                print nom,
1302            print ""
1303
1304
1305    def dumpsect(self, sector, firstindex=0):
1306        "Displays a sector in a human-readable form, for debugging purpose."
1307        if not DEBUG_MODE:
1308            return
1309        VPL=8 # number of values per line (8+1 * 8+1 = 81)
1310        tab = array.array(UINT32, sector)
1311        nbsect = len(tab)
1312        nlines = (nbsect+VPL-1)/VPL
1313        print "index",
1314        for i in range(VPL):
1315            print ("%8X" % i),
1316        print ""
1317        for l in range(nlines):
1318            index = l*VPL
1319            print ("%8X:" % (firstindex+index)),
1320            for i in range(index, index+VPL):
1321                if i>=nbsect:
1322                    break
1323                sect = tab[i]
1324                nom = "%8X" % sect
1325                print nom,
1326            print ""
1327
1328    def sect2array(self, sect):
1329        """
1330        convert a sector to an array of 32 bits unsigned integers,
1331        swapping bytes on big endian CPUs such as PowerPC (old Macs)
1332        """
1333        a = array.array(UINT32, sect)
1334        # if CPU is big endian, swap bytes:
1335        if sys.byteorder == 'big':
1336            a.byteswap()
1337        return a
1338
1339
1340    def loadfat_sect(self, sect):
1341        """
1342        Adds the indexes of the given sector to the FAT
1343        sect: string containing the first FAT sector, or array of long integers
1344        return: index of last FAT sector.
1345        """
1346        # a FAT sector is an array of ulong integers.
1347        if isinstance(sect, array.array):
1348            # if sect is already an array it is directly used
1349            fat1 = sect
1350        else:
1351            # if it's a raw sector, it is parsed in an array
1352            fat1 = self.sect2array(sect)
1353            self.dumpsect(sect)
1354        # The FAT is a sector chain starting at the first index of itself.
1355        for isect in fat1:
1356            #print "isect = %X" % isect
1357            if isect == ENDOFCHAIN or isect == FREESECT:
1358                # the end of the sector chain has been reached
1359                break
1360            # read the FAT sector
1361            s = self.getsect(isect)
1362            # parse it as an array of 32 bits integers, and add it to the
1363            # global FAT array
1364            nextfat = self.sect2array(s)
1365            self.fat = self.fat + nextfat
1366        return isect
1367
1368
1369    def loadfat(self, header):
1370        """
1371        Load the FAT table.
1372        """
1373        # The header contains a sector  numbers
1374        # for the first 109 FAT sectors.  Additional sectors are
1375        # described by DIF blocks
1376
1377        sect = header[76:512]
1378        debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)/4) )
1379        #fat    = []
1380        # [PL] FAT is an array of 32 bits unsigned ints, it's more effective
1381        # to use an array than a list in Python.
1382        # It's initialized as empty first:
1383        self.fat = array.array(UINT32)
1384        self.loadfat_sect(sect)
1385        #self.dumpfat(self.fat)
1386##      for i in range(0, len(sect), 4):
1387##          ix = i32(sect, i)
1388##          #[PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL:
1389##          if ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL:
1390##              break
1391##          s = self.getsect(ix)
1392##          #fat    = fat + map(lambda i, s=s: i32(s, i), range(0, len(s), 4))
1393##          fat = fat + array.array(UINT32, s)
1394        if self.csectDif != 0:
1395            # [PL] There's a DIFAT because file is larger than 6.8MB
1396            # some checks just in case:
1397            if self.csectFat <= 109:
1398                # there must be at least 109 blocks in header and the rest in
1399                # DIFAT, so number of sectors must be >109.
1400                self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors')
1401            if self.sectDifStart >= self.nb_sect:
1402                # initial DIFAT block index must be valid
1403                self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range')
1404            debug( "DIFAT analysis..." )
1405            # We compute the necessary number of DIFAT sectors :
1406            # (each DIFAT sector = 127 pointers + 1 towards next DIFAT sector)
1407            nb_difat = (self.csectFat-109 + 126)/127
1408            debug( "nb_difat = %d" % nb_difat )
1409            if self.csectDif != nb_difat:
1410                raise IOError, 'incorrect DIFAT'
1411            isect_difat = self.sectDifStart
1412            for i in xrange(nb_difat):
1413                debug( "DIFAT block %d, sector %X" % (i, isect_difat) )
1414                #TODO: check if corresponding FAT SID = DIFSECT
1415                sector_difat = self.getsect(isect_difat)
1416                difat = self.sect2array(sector_difat)
1417                self.dumpsect(sector_difat)
1418                self.loadfat_sect(difat[:127])
1419                # last DIFAT pointer is next DIFAT sector:
1420                isect_difat = difat[127]
1421                debug( "next DIFAT sector: %X" % isect_difat )
1422            # checks:
1423            if isect_difat not in [ENDOFCHAIN, FREESECT]:
1424                # last DIFAT pointer value must be ENDOFCHAIN or FREESECT
1425                raise IOError, 'incorrect end of DIFAT'
1426##          if len(self.fat) != self.csectFat:
1427##              # FAT should contain csectFat blocks
1428##              print "FAT length: %d instead of %d" % (len(self.fat), self.csectFat)
1429##              raise IOError, 'incorrect DIFAT'
1430        # since FAT is read from fixed-size sectors, it may contain more values
1431        # than the actual number of sectors in the file.
1432        # Keep only the relevant sector indexes:
1433        if len(self.fat) > self.nb_sect:
1434            debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect))
1435            self.fat = self.fat[:self.nb_sect]
1436        debug('\nFAT:')
1437        self.dumpfat(self.fat)
1438
1439
1440    def loadminifat(self):
1441        """
1442        Load the MiniFAT table.
1443        """
1444        # MiniFAT is stored in a standard  sub-stream, pointed to by a header
1445        # field.
1446        # NOTE: there are two sizes to take into account for this stream:
1447        # 1) Stream size is calculated according to the number of sectors
1448        #    declared in the OLE header. This allocated stream may be more than
1449        #    needed to store the actual sector indexes.
1450        # (self.csectMiniFat is the number of sectors of size self.SectorSize)
1451        stream_size = self.csectMiniFat * self.SectorSize
1452        # 2) Actually used size is calculated by dividing the MiniStream size
1453        #    (given by root entry size) by the size of mini sectors, *4 for
1454        #    32 bits indexes:
1455        nb_minisectors = (self.root.size + self.MiniSectorSize-1) / self.MiniSectorSize
1456        used_size = nb_minisectors * 4
1457        debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' %
1458            (self.minifatsect, self.csectMiniFat, used_size, stream_size, nb_minisectors))
1459        if used_size > stream_size:
1460            # This is not really a problem, but may indicate a wrong implementation:
1461            self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT')
1462        # In any case, first read stream_size:
1463        s = self._open(self.minifatsect, stream_size, force_FAT=True).read()
1464        #[PL] Old code replaced by an array:
1465        #self.minifat = map(lambda i, s=s: i32(s, i), range(0, len(s), 4))
1466        self.minifat = self.sect2array(s)
1467        # Then shrink the array to used size, to avoid indexes out of MiniStream:
1468        debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors))
1469        self.minifat = self.minifat[:nb_minisectors]
1470        debug('loadminifat(): len=%d' % len(self.minifat))
1471        debug('\nMiniFAT:')
1472        self.dumpfat(self.minifat)
1473
1474    def getsect(self, sect):
1475        """
1476        Read given sector from file on disk.
1477        sect: sector index
1478        returns a string containing the sector data.
1479        """
1480        # [PL] this original code was wrong when sectors are 4KB instead of
1481        # 512 bytes:
1482        #self.fp.seek(512 + self.sectorsize * sect)
1483        #[PL]: added safety checks:
1484        #print "getsect(%X)" % sect
1485        try:
1486            self.fp.seek(self.sectorsize * (sect+1))
1487        except:
1488            debug('getsect(): sect=%X, seek=%d, filesize=%d' %
1489                (sect, self.sectorsize*(sect+1), self._filesize))
1490            self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
1491        sector = self.fp.read(self.sectorsize)
1492        if len(sector) != self.sectorsize:
1493            debug('getsect(): sect=%X, read=%d, sectorsize=%d' %
1494                (sect, len(sector), self.sectorsize))
1495            self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector')
1496        return sector
1497
1498
1499    def loaddirectory(self, sect):
1500        """
1501        Load the directory.
1502        sect: sector index of directory stream.
1503        """
1504        # The directory is  stored in a standard
1505        # substream, independent of its size.
1506
1507        # open directory stream as a read-only file:
1508        # (stream size is not known in advance)
1509        self.directory_fp = self._open(sect)
1510
1511        #[PL] to detect malformed documents and avoid DoS attacks, the maximum
1512        # number of directory entries can be calculated:
1513        max_entries = self.directory_fp.size / 128
1514        debug('loaddirectory: size=%d, max_entries=%d' %
1515            (self.directory_fp.size, max_entries))
1516
1517        # Create list of directory entries
1518        #self.direntries = []
1519        # We start with a list of "None" object
1520        self.direntries = [None] * max_entries
1521##        for sid in xrange(max_entries):
1522##            entry = fp.read(128)
1523##            if not entry:
1524##                break
1525##            self.direntries.append(_OleDirectoryEntry(entry, sid, self))
1526        # load root entry:
1527        root_entry = self._load_direntry(0)
1528        # Root entry is the first entry:
1529        self.root = self.direntries[0]
1530        # read and build all storage trees, starting from the root:
1531        self.root.build_storage_tree()
1532
1533
1534    def _load_direntry (self, sid):
1535        """
1536        Load a directory entry from the directory.
1537        This method should only be called once for each storage/stream when
1538        loading the directory.
1539        sid: index of storage/stream in the directory.
1540        return: a _OleDirectoryEntry object
1541        raise: IOError if the entry has always been referenced.
1542        """
1543        # check if SID is OK:
1544        if sid<0 or sid>=len(self.direntries):
1545            self._raise_defect(DEFECT_FATAL, "OLE directory index out of range")
1546        # check if entry was already referenced:
1547        if self.direntries[sid] is not None:
1548            self._raise_defect(DEFECT_INCORRECT,
1549                "double reference for OLE stream/storage")
1550            # if exception not raised, return the object
1551            return self.direntries[sid]
1552        self.directory_fp.seek(sid * 128)
1553        entry = self.directory_fp.read(128)
1554        self.direntries[sid] = _OleDirectoryEntry(entry, sid, self)
1555        return self.direntries[sid]
1556
1557
1558    def dumpdirectory(self):
1559        """
1560        Dump directory (for debugging only)
1561        """
1562        self.root.dump()
1563
1564
1565    def _open(self, start, size = 0x7FFFFFFF, force_FAT=False):
1566        """
1567        Open a stream, either in FAT or MiniFAT according to its size.
1568        (openstream helper)
1569
1570        start: index of first sector
1571        size: size of stream (or nothing if size is unknown)
1572        force_FAT: if False (default), stream will be opened in FAT or MiniFAT
1573                   according to size. If True, it will always be opened in FAT.
1574        """
1575        debug('OleFileIO.open(): sect=%d, size=%d, force_FAT=%s' %
1576            (start, size, str(force_FAT)))
1577        # stream size is compared to the MiniSectorCutoff threshold:
1578        if size < self.minisectorcutoff and not force_FAT:
1579            # ministream object
1580            if not self.ministream:
1581                # load MiniFAT if it wasn't already done:
1582                self.loadminifat()
1583                # The first sector index of the miniFAT stream is stored in the
1584                # root directory entry:
1585                size_ministream = self.root.size
1586                debug('Opening MiniStream: sect=%d, size=%d' %
1587                    (self.root.isectStart, size_ministream))
1588                self.ministream = self._open(self.root.isectStart,
1589                    size_ministream, force_FAT=True)
1590            return _OleStream(self.ministream, start, size, 0,
1591                              self.minisectorsize, self.minifat,
1592                              self.ministream.size)
1593        else:
1594            # standard stream
1595            return _OleStream(self.fp, start, size, 512,
1596                              self.sectorsize, self.fat, self._filesize)
1597
1598
1599    def _list(self, files, prefix, node, streams=True, storages=False):
1600        """
1601        (listdir helper)
1602        files: list of files to fill in
1603        prefix: current location in storage tree (list of names)
1604        node: current node (_OleDirectoryEntry object)
1605        streams: bool, include streams if True (True by default) - new in v0.26
1606        storages: bool, include storages if True (False by default) - new in v0.26
1607        (note: the root storage is never included)
1608        """
1609        prefix = prefix + [node.name]
1610        for entry in node.kids:
1611            if entry.kids:
1612                # this is a storage
1613                if storages:
1614                    # add it to the list
1615                    files.append(prefix[1:] + [entry.name])
1616                # check its kids
1617                self._list(files, prefix, entry, streams, storages)
1618            else:
1619                # this is a stream
1620                if streams:
1621                    # add it to the list
1622                    files.append(prefix[1:] + [entry.name])
1623
1624
1625    def listdir(self, streams=True, storages=False):
1626        """
1627        Return a list of streams stored in this file
1628
1629        streams: bool, include streams if True (True by default) - new in v0.26
1630        storages: bool, include storages if True (False by default) - new in v0.26
1631        (note: the root storage is never included)
1632        """
1633        files = []
1634        self._list(files, [], self.root, streams, storages)
1635        return files
1636
1637
1638    def _find(self, filename):
1639        """
1640        Returns directory entry of given filename. (openstream helper)
1641        Note: this method is case-insensitive.
1642
1643        filename: path of stream in storage tree (except root entry), either:
1644            - a string using Unix path syntax, for example:
1645              'storage_1/storage_1.2/stream'
1646            - a list of storage filenames, path to the desired stream/storage.
1647              Example: ['storage_1', 'storage_1.2', 'stream']
1648        return: sid of requested filename
1649        raise IOError if file not found
1650        """
1651
1652        # if filename is a string instead of a list, split it on slashes to
1653        # convert to a list:
1654        if isinstance(filename, basestring):
1655            filename = filename.split('/')
1656        # walk across storage tree, following given path:
1657        node = self.root
1658        for name in filename:
1659            for kid in node.kids:
1660                if kid.name.lower() == name.lower():
1661                    break
1662            else:
1663                raise IOError, "file not found"
1664            node = kid
1665        return node.sid
1666
1667
1668    def openstream(self, filename):
1669        """
1670        Open a stream as a read-only file object (StringIO).
1671
1672        filename: path of stream in storage tree (except root entry), either:
1673            - a string using Unix path syntax, for example:
1674              'storage_1/storage_1.2/stream'
1675            - a list of storage filenames, path to the desired stream/storage.
1676              Example: ['storage_1', 'storage_1.2', 'stream']
1677        return: file object (read-only)
1678        raise IOError if filename not found, or if this is not a stream.
1679        """
1680        sid = self._find(filename)
1681        entry = self.direntries[sid]
1682        if entry.entry_type != STGTY_STREAM:
1683            raise IOError, "this file is not a stream"
1684        return self._open(entry.isectStart, entry.size)
1685
1686
1687    def get_type(self, filename):
1688        """
1689        Test if given filename exists as a stream or a storage in the OLE
1690        container, and return its type.
1691
1692        filename: path of stream in storage tree. (see openstream for syntax)
1693        return: False if object does not exist, its entry type (>0) otherwise:
1694            - STGTY_STREAM: a stream
1695            - STGTY_STORAGE: a storage
1696            - STGTY_ROOT: the root entry
1697        """
1698        try:
1699            sid = self._find(filename)
1700            entry = self.direntries[sid]
1701            return entry.entry_type
1702        except:
1703            return False
1704
1705
1706    def getmtime(self, filename):
1707        """
1708        Return modification time of a stream/storage.
1709
1710        filename: path of stream/storage in storage tree. (see openstream for
1711        syntax)
1712        return: None if modification time is null, a python datetime object
1713        otherwise (UTC timezone)
1714
1715        new in version 0.26
1716        """
1717        sid = self._find(filename)
1718        entry = self.direntries[sid]
1719        return entry.getmtime()
1720
1721
1722    def getctime(self, filename):
1723        """
1724        Return creation time of a stream/storage.
1725
1726        filename: path of stream/storage in storage tree. (see openstream for
1727        syntax)
1728        return: None if creation time is null, a python datetime object
1729        otherwise (UTC timezone)
1730
1731        new in version 0.26
1732        """
1733        sid = self._find(filename)
1734        entry = self.direntries[sid]
1735        return entry.getctime()
1736
1737
1738    def exists(self, filename):
1739        """
1740        Test if given filename exists as a stream or a storage in the OLE
1741        container.
1742
1743        filename: path of stream in storage tree. (see openstream for syntax)
1744        return: True if object exist, else False.
1745        """
1746        try:
1747            sid = self._find(filename)
1748            return True
1749        except:
1750            return False
1751
1752
1753    def get_size(self, filename):
1754        """
1755        Return size of a stream in the OLE container, in bytes.
1756
1757        filename: path of stream in storage tree (see openstream for syntax)
1758        return: size in bytes (long integer)
1759        raise: IOError if file not found, TypeError if this is not a stream.
1760        """
1761        sid = self._find(filename)
1762        entry = self.direntries[sid]
1763        if entry.entry_type != STGTY_STREAM:
1764            #TODO: Should it return zero instead of raising an exception ?
1765            raise TypeError, 'object is not an OLE stream'
1766        return entry.size
1767
1768
1769    def get_rootentry_name(self):
1770        """
1771        Return root entry name. Should usually be 'Root Entry' or 'R' in most
1772        implementations.
1773        """
1774        return self.root.name
1775
1776
1777    def getproperties(self, filename, convert_time=False, no_conversion=None):
1778        """
1779        Return properties described in substream.
1780
1781        filename: path of stream in storage tree (see openstream for syntax)
1782        convert_time: bool, if True timestamps will be converted to Python datetime
1783        no_conversion: None or list of int, timestamps not to be converted
1784                       (for example total editing time is not a real timestamp)
1785        return: a dictionary of values indexed by id (integer)
1786        """
1787        # make sure no_conversion is a list, just to simplify code below:
1788        if no_conversion == None:
1789            no_conversion = []
1790        # stream path as a string to report exceptions:
1791        streampath = filename
1792        if not isinstance(streampath, str):
1793            streampath = '/'.join(streampath)
1794
1795        fp = self.openstream(filename)
1796
1797        data = {}
1798
1799        try:
1800            # header
1801            s = fp.read(28)
1802            clsid = _clsid(s[8:24])
1803
1804            # format id
1805            s = fp.read(20)
1806            fmtid = _clsid(s[:16])
1807            fp.seek(i32(s, 16))
1808
1809            # get section
1810            s = "****" + fp.read(i32(fp.read(4))-4)
1811            # number of properties:
1812            num_props = i32(s, 4)
1813        except:
1814            # catch exception while parsing property header, and only raise
1815            # a DEFECT_INCORRECT then return an empty dict, because this is not
1816            # a fatal error when parsing the whole file
1817            exctype, excvalue = sys.exc_info()[:2]
1818            msg = 'Error while parsing properties header in stream %s: %s' % (
1819                repr(streampath), excvalue)
1820            self._raise_defect(DEFECT_INCORRECT, msg, exctype)
1821            return data
1822
1823        for i in range(num_props):
1824            try:
1825                id = 0 # just in case of an exception
1826                id = i32(s, 8+i*8)
1827                offset = i32(s, 12+i*8)
1828                type = i32(s, offset)
1829
1830                debug ('property id=%d: type=%d offset=%X' % (id, type, offset))
1831
1832                # test for common types first (should perhaps use
1833                # a dictionary instead?)
1834
1835                if type == VT_I2: # 16-bit signed integer
1836                    value = i16(s, offset+4)
1837                    if value >= 32768:
1838                        value = value - 65536
1839                elif type == VT_UI2: # 2-byte unsigned integer
1840                    value = i16(s, offset+4)
1841                elif type in (VT_I4, VT_INT, VT_ERROR):
1842                    # VT_I4: 32-bit signed integer
1843                    # VT_ERROR: HRESULT, similar to 32-bit signed integer,
1844                    # see http://msdn.microsoft.com/en-us/library/cc230330.aspx
1845                    value = i32(s, offset+4)
1846                elif type in (VT_UI4, VT_UINT): # 4-byte unsigned integer
1847                    value = i32(s, offset+4) # FIXME
1848                elif type in (VT_BSTR, VT_LPSTR):
1849                    # CodePageString, see http://msdn.microsoft.com/en-us/library/dd942354.aspx
1850                    # size is a 32 bits integer, including the null terminator, and
1851                    # possibly trailing or embedded null chars
1852                    #TODO: if codepage is unicode, the string should be converted as such
1853                    count = i32(s, offset+4)
1854                    value = s[offset+8:offset+8+count-1]
1855                    # remove all null chars:
1856                    value = value.replace('\x00', '')
1857                elif type == VT_BLOB:
1858                    # binary large object (BLOB)
1859                    # see http://msdn.microsoft.com/en-us/library/dd942282.aspx
1860                    count = i32(s, offset+4)
1861                    value = s[offset+8:offset+8+count]
1862                elif type == VT_LPWSTR:
1863                    # UnicodeString
1864                    # see http://msdn.microsoft.com/en-us/library/dd942313.aspx
1865                    # "the string should NOT contain embedded or additional trailing
1866                    # null characters."
1867                    count = i32(s, offset+4)
1868                    value = _unicode(s[offset+8:offset+8+count*2])
1869                elif type == VT_FILETIME:
1870                    value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32)
1871                    # FILETIME is a 64-bit int: "number of 100ns periods
1872                    # since Jan 1,1601".
1873                    if convert_time and id not in no_conversion:
1874                        debug('Converting property #%d to python datetime, value=%d=%fs'
1875                                %(id, value, float(value)/10000000L))
1876                        # convert FILETIME to Python datetime.datetime
1877                        # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/
1878                        _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
1879                        debug('timedelta days=%d' % (value/(10*1000000*3600*24)))
1880                        value = _FILETIME_null_date + datetime.timedelta(microseconds=value/10)
1881                    else:
1882                        # legacy code kept for backward compatibility: returns a
1883                        # number of seconds since Jan 1,1601
1884                        value = value / 10000000L # seconds
1885                elif type == VT_UI1: # 1-byte unsigned integer
1886                    value = ord(s[offset+4])
1887                elif type == VT_CLSID:
1888                    value = _clsid(s[offset+4:offset+20])
1889                elif type == VT_CF:
1890                    # PropertyIdentifier or ClipboardData??
1891                    # see http://msdn.microsoft.com/en-us/library/dd941945.aspx
1892                    count = i32(s, offset+4)
1893                    value = s[offset+8:offset+8+count]
1894                elif type == VT_BOOL:
1895                    # VARIANT_BOOL, 16 bits bool, 0x0000=Fals, 0xFFFF=True
1896                    # see http://msdn.microsoft.com/en-us/library/cc237864.aspx
1897                    value = bool(i16(s, offset+4))
1898                else:
1899                    value = None # everything else yields "None"
1900                    debug ('property id=%d: type=%d not implemented in parser yet' % (id, type))
1901
1902                # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE,
1903                # VT_DECIMAL, VT_I1, VT_I8, VT_UI8,
1904                # see http://msdn.microsoft.com/en-us/library/dd942033.aspx
1905
1906                # FIXME: add support for VT_VECTOR
1907                # VT_VECTOR is a 32 uint giving the number of items, followed by
1908                # the items in sequence. The VT_VECTOR value is combined with the
1909                # type of items, e.g. VT_VECTOR|VT_BSTR
1910                # see http://msdn.microsoft.com/en-us/library/dd942011.aspx
1911
1912                #print "%08x" % id, repr(value),
1913                #print "(%s)" % VT[i32(s, offset) & 0xFFF]
1914
1915                data[id] = value
1916            except:
1917                # catch exception while parsing each property, and only raise
1918                # a DEFECT_INCORRECT, because parsing can go on
1919                exctype, excvalue = sys.exc_info()[:2]
1920                msg = 'Error while parsing property id %d in stream %s: %s' % (
1921                    id, repr(streampath), excvalue)
1922                self._raise_defect(DEFECT_INCORRECT, msg, exctype)
1923
1924        return data
1925
1926    def get_metadata(self):
1927        """
1928        Parse standard properties streams, return an OleMetadata object
1929        containing all the available metadata.
1930        (also stored in the metadata attribute of the OleFileIO object)
1931
1932        new in version 0.25
1933        """
1934        self.metadata = OleMetadata()
1935        self.metadata.parse_properties(self)
1936        return self.metadata
1937
1938#
1939# --------------------------------------------------------------------
1940# This script can be used to dump the directory of any OLE2 structured
1941# storage file.
1942
1943if __name__ == "__main__":
1944
1945    import sys
1946
1947    # [PL] display quick usage info if launched from command-line
1948    if len(sys.argv) <= 1:
1949        print __doc__
1950        print """
1951Launched from command line, this script parses OLE files and prints info.
1952
1953Usage: OleFileIO_PL.py [-d] [-c] <file> [file2 ...]
1954
1955Options:
1956-d : debug mode (display a lot of debug information, for developers only)
1957-c : check all streams (for debugging purposes)
1958"""
1959        sys.exit()
1960
1961    check_streams = False
1962    for filename in sys.argv[1:]:
1963##      try:
1964            # OPTIONS:
1965            if filename == '-d':
1966                # option to switch debug mode on:
1967                set_debug_mode(True)
1968                continue
1969            if filename == '-c':
1970                # option to switch check streams mode on:
1971                check_streams = True
1972                continue
1973
1974            ole = OleFileIO(filename)#, raise_defects=DEFECT_INCORRECT)
1975            print "-" * 68
1976            print filename
1977            print "-" * 68
1978            ole.dumpdirectory()
1979            for streamname in ole.listdir():
1980                if streamname[-1][0] == "\005":
1981                    print streamname, ": properties"
1982                    props = ole.getproperties(streamname, convert_time=True)
1983                    props = props.items()
1984                    props.sort()
1985                    for k, v in props:
1986                        #[PL]: avoid to display too large or binary values:
1987                        if isinstance(v, basestring):
1988                            if len(v) > 50:
1989                                v = v[:50]
1990                            # quick and dirty binary check:
1991                            for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20,
1992                                21,22,23,24,25,26,27,28,29,30,31):
1993                                if chr(c) in v:
1994                                    v = '(binary data)'
1995                                    break
1996                        print "   ", k, v
1997
1998            if check_streams:
1999                # Read all streams to check if there are errors:
2000                print '\nChecking streams...'
2001                for streamname in ole.listdir():
2002                    # print name using repr() to convert binary chars to \xNN:
2003                    print '-', repr('/'.join(streamname)),'-',
2004                    st_type = ole.get_type(streamname)
2005                    if st_type == STGTY_STREAM:
2006                        print 'size %d' % ole.get_size(streamname)
2007                        # just try to read stream in memory:
2008                        ole.openstream(streamname)
2009                    else:
2010                        print 'NOT a stream : type=%d' % st_type
2011                print ''
2012
2013##            for streamname in ole.listdir():
2014##                # print name using repr() to convert binary chars to \xNN:
2015##                print '-', repr('/'.join(streamname)),'-',
2016##                print ole.getmtime(streamname)
2017##            print ''
2018
2019            print 'Modification/Creation times of all directory entries:'
2020            for entry in ole.direntries:
2021                if entry is not None:
2022                    print '- %s: mtime=%s ctime=%s' % (entry.name,
2023                        entry.getmtime(), entry.getctime())
2024            print ''
2025
2026            # parse and display metadata:
2027            meta = ole.get_metadata()
2028            meta.dump()
2029            print ''
2030            #[PL] Test a few new methods:
2031            root = ole.get_rootentry_name()
2032            print 'Root entry name: "%s"' % root
2033            if ole.exists('worddocument'):
2034                print "This is a Word document."
2035                print "type of stream 'WordDocument':", ole.get_type('worddocument')
2036                print "size :", ole.get_size('worddocument')
2037                if ole.exists('macros/vba'):
2038                    print "This document may contain VBA macros."
2039
2040            # print parsing issues:
2041            print '\nNon-fatal issues raised during parsing:'
2042            if ole.parsing_issues:
2043                for exctype, msg in ole.parsing_issues:
2044                    print '- %s: %s' % (exctype.__name__, msg)
2045            else:
2046                print 'None'
2047##      except IOError, v:
2048##          print "***", "cannot read", file, "-", v