· 7 years ago · Jan 10, 2019, 10:12 AM
1#!/usr/local/bin/python
2# -*- coding: latin-1 -*-
3"""
4OleFileIO_PL:
5 Module to read Microsoft OLE2 files (also called Structured Storage or
6 Microsoft Compound Document File Format), such as Microsoft Office
7 documents, Image Composer and FlashPix files, Outlook messages, ...
8
9version 0.26 2013-07-24 Philippe Lagadec - http://www.decalage.info
10
11Project website: http://www.decalage.info/python/olefileio
12
13Improved version of the OleFileIO module from PIL library v1.1.6
14See: http://www.pythonware.com/products/pil/index.htm
15
16The Python Imaging Library (PIL) is
17 Copyright (c) 1997-2005 by Secret Labs AB
18 Copyright (c) 1995-2005 by Fredrik Lundh
19OleFileIO_PL changes are Copyright (c) 2005-2013 by Philippe Lagadec
20
21See source code and LICENSE.txt for information on usage and redistribution.
22
23WARNING: THIS IS (STILL) WORK IN PROGRESS.
24"""
25
26__author__ = "Philippe Lagadec, Fredrik Lundh (Secret Labs AB)"
27__date__ = "2013-07-24"
28__version__ = '0.26'
29
30#--- LICENSE ------------------------------------------------------------------
31
32# OleFileIO_PL is an improved version of the OleFileIO module from the
33# Python Imaging Library (PIL).
34
35# OleFileIO_PL changes are Copyright (c) 2005-2013 by Philippe Lagadec
36#
37# The Python Imaging Library (PIL) is
38# Copyright (c) 1997-2005 by Secret Labs AB
39# Copyright (c) 1995-2005 by Fredrik Lundh
40#
41# By obtaining, using, and/or copying this software and/or its associated
42# documentation, you agree that you have read, understood, and will comply with
43# the following terms and conditions:
44#
45# Permission to use, copy, modify, and distribute this software and its
46# associated documentation for any purpose and without fee is hereby granted,
47# provided that the above copyright notice appears in all copies, and that both
48# that copyright notice and this permission notice appear in supporting
49# documentation, and that the name of Secret Labs AB or the author(s) not be used
50# in advertising or publicity pertaining to distribution of the software
51# without specific, written prior permission.
52#
53# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
54# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
55# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL,
56# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
57# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
58# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
59# PERFORMANCE OF THIS SOFTWARE.
60
61#-----------------------------------------------------------------------------
62# CHANGELOG: (only OleFileIO_PL changes compared to PIL 1.1.6)
63# 2005-05-11 v0.10 PL: - a few fixes for Python 2.4 compatibility
64# (all changes flagged with [PL])
65# 2006-02-22 v0.11 PL: - a few fixes for some Office 2003 documents which raise
66# exceptions in _OleStream.__init__()
67# 2006-06-09 v0.12 PL: - fixes for files above 6.8MB (DIFAT in loadfat)
68# - added some constants
69# - added header values checks
70# - added some docstrings
71# - getsect: bugfix in case sectors >512 bytes
72# - getsect: added conformity checks
73# - DEBUG_MODE constant to activate debug display
74# 2007-09-04 v0.13 PL: - improved/translated (lots of) comments
75# - updated license
76# - converted tabs to 4 spaces
77# 2007-11-19 v0.14 PL: - added OleFileIO._raise_defect() to adapt sensitivity
78# - improved _unicode() to use Python 2.x unicode support
79# - fixed bug in _OleDirectoryEntry
80# 2007-11-25 v0.15 PL: - added safety checks to detect FAT loops
81# - fixed _OleStream which didn't check stream size
82# - added/improved many docstrings and comments
83# - moved helper functions _unicode and _clsid out of
84# OleFileIO class
85# - improved OleFileIO._find() to add Unix path syntax
86# - OleFileIO._find() is now case-insensitive
87# - added get_type() and get_rootentry_name()
88# - rewritten loaddirectory and _OleDirectoryEntry
89# 2007-11-27 v0.16 PL: - added _OleDirectoryEntry.kids_dict
90# - added detection of duplicate filenames in storages
91# - added detection of duplicate references to streams
92# - added get_size() and exists() to _OleDirectoryEntry
93# - added isOleFile to check header before parsing
94# - added __all__ list to control public keywords in pydoc
95# 2007-12-04 v0.17 PL: - added _load_direntry to fix a bug in loaddirectory
96# - improved _unicode(), added workarounds for Python <2.3
97# - added set_debug_mode and -d option to set debug mode
98# - fixed bugs in OleFileIO.open and _OleDirectoryEntry
99# - added safety check in main for large or binary
100# properties
101# - allow size>0 for storages for some implementations
102# 2007-12-05 v0.18 PL: - fixed several bugs in handling of FAT, MiniFAT and
103# streams
104# - added option '-c' in main to check all streams
105# 2009-12-10 v0.19 PL: - bugfix for 32 bit arrays on 64 bits platforms
106# (thanks to Ben G. and Martijn for reporting the bug)
107# 2009-12-11 v0.20 PL: - bugfix in OleFileIO.open when filename is not plain str
108# 2010-01-22 v0.21 PL: - added support for big-endian CPUs such as PowerPC Macs
109# 2012-02-16 v0.22 PL: - fixed bug in getproperties, patch by chuckleberryfinn
110# (https://bitbucket.org/decalage/olefileio_pl/issue/7)
111# - added close method to OleFileIO (fixed issue #2)
112# 2012-07-25 v0.23 PL: - added support for file-like objects (patch by mete0r_kr)
113# 2013-05-05 v0.24 PL: - getproperties: added conversion from filetime to python
114# datetime
115# - main: displays properties with date format
116# - new class OleMetadata to parse standard properties
117# - added get_metadata method
118# 2013-05-07 v0.24 PL: - a few improvements in OleMetadata
119# 2013-05-24 v0.25 PL: - getproperties: option to not convert some timestamps
120# - OleMetaData: total_edit_time is now a number of seconds,
121# not a timestamp
#                        - getproperties: added support for VT_BOOL, VT_INT, VT_UINT
123# - getproperties: filter out null chars from strings
124# - getproperties: raise non-fatal defects instead of
125# exceptions when properties cannot be parsed properly
126# 2013-05-27 PL: - getproperties: improved exception handling
127# - _raise_defect: added option to set exception type
128# - all non-fatal issues are now recorded, and displayed
129# when run as a script
130# 2013-07-11 v0.26 PL: - added methods to get modification and creation times
131# of a directory entry or a storage/stream
132# - fixed parsing of direntry timestamps
133# 2013-07-24 PL: - new options in listdir to list storages and/or streams
134
135#-----------------------------------------------------------------------------
136# TODO (for version 1.0):
137# + add path attrib to _OleDirEntry, set it once and for all in init or
138# append_kids (then listdir/_list can be simplified)
139# - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ...
140# - add underscore to each private method, to avoid their display in
141# pydoc/epydoc documentation - Remove it for classes to be documented
142# - replace all raised exceptions with _raise_defect (at least in OleFileIO)
143# - merge code from _OleStream and OleFileIO.getsect to read sectors
144# (maybe add a class for FAT and MiniFAT ?)
145# - add method to check all streams (follow sectors chains without storing all
146# stream in memory, and report anomalies)
147# - use _OleDirectoryEntry.kids_dict to improve _find and _list ?
148# - fix Unicode names handling (find some way to stay compatible with Py1.5.2)
149# => if possible avoid converting names to Latin-1
150# - review DIFAT code: fix handling of DIFSECT blocks in FAT (not stop)
151# - rewrite OleFileIO.getproperties
152# - improve docstrings to show more sample uses
153# - see also original notes and FIXME below
154# - remove all obsolete FIXMEs
155# - OleMetadata: fix version attrib according to
156# http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
157
158# IDEAS:
159# - in OleFileIO._open and _OleStream, use size=None instead of 0x7FFFFFFF for
160# streams with unknown size
161# - use arrays of int instead of long integers for FAT/MiniFAT, to improve
162# performance and reduce memory usage ? (possible issue with values >2^31)
163# - provide tests with unittest (may need write support to create samples)
164# - move all debug code (and maybe dump methods) to a separate module, with
165# a class which inherits OleFileIO ?
166# - fix docstrings to follow epydoc format
167# - add support for 4K sectors ?
168# - add support for big endian byte order ?
169# - create a simple OLE explorer with wxPython
170
171# FUTURE EVOLUTIONS to add write support:
172# 1) add ability to write a stream back on disk from StringIO (same size, no
173# change in FAT/MiniFAT).
174# 2) rename a stream/storage if it doesn't change the RB tree
175# 3) use rbtree module to update the red-black tree + any rename
176# 4) remove a stream/storage: free sectors in FAT/MiniFAT
177# 5) allocate new sectors in FAT/MiniFAT
178# 6) create new storage/stream
179#-----------------------------------------------------------------------------
180
181#
182# THIS IS WORK IN PROGRESS
183#
184# The Python Imaging Library
185# $Id: OleFileIO.py 2339 2005-03-25 08:02:17Z fredrik $
186#
187# stuff to deal with OLE2 Structured Storage files. this module is
188# used by PIL to read Image Composer and FlashPix files, but can also
189# be used to read other files of this type.
190#
191# History:
192# 1997-01-20 fl Created
193# 1997-01-22 fl Fixed 64-bit portability quirk
194# 2003-09-09 fl Fixed typo in OleFileIO.loadfat (noted by Daniel Haertle)
195# 2004-02-29 fl Changed long hex constants to signed integers
196#
197# Notes:
198# FIXME: sort out sign problem (eliminate long hex constants)
199# FIXME: change filename to use "a/b/c" instead of ["a", "b", "c"]
200# FIXME: provide a glob mechanism function (using fnmatchcase)
201#
202# Literature:
203#
204# "FlashPix Format Specification, Appendix A", Kodak and Microsoft,
205# September 1996.
206#
207# Quotes:
208#
209# "If this document and functionality of the Software conflict,
210# the actual functionality of the Software represents the correct
211# functionality" -- Microsoft, in the OLE format specification
212#
213# Copyright (c) Secret Labs AB 1997.
214# Copyright (c) Fredrik Lundh 1997.
215#
216# See the README file for information on usage and redistribution.
217#
218
219#------------------------------------------------------------------------------
220
221import string, StringIO, struct, array, os.path, sys, datetime
222
#[PL] Define explicitly the public API to avoid private objects in pydoc.
# Only these names are exported by "from <module> import *".
__all__ = ['OleFileIO', 'isOleFile']
225
#[PL] workaround to fix an issue with array item size on 64 bits systems:
# UINT32 is the array typecode whose items are exactly 4 bytes wide.
if array.array('L').itemsize == 4:
    # on 32 bits platforms, long integers in an array are 32 bits:
    UINT32 = 'L'
elif array.array('I').itemsize == 4:
    # on 64 bits platforms, integers in an array are 32 bits:
    UINT32 = 'I'
else:
    # call form of raise: same exception, valid on both Python 2 and 3
    raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...')
235
236
#[PL] These workarounds were inspired from the Path module
# (see http://www.jorendorff.com/articles/python/path/)
#TODO: test with old Python versions

# NOTE: the former pre-2.3 fallback "True, False = 1, 0" has been removed.
# This module imports datetime (new in Python 2.3), so it can only be loaded
# by interpreters which already provide True and False, and assigning to
# these names is a syntax error on Python 3.

# Workaround for interpreters which do not define basestring:
try:
    basestring
except NameError:
    try:
        # is Unicode supported (Python >2.0 or >1.6 ?)
        basestring = (str, unicode)
    except NameError:
        # no separate unicode type: str is the only string type
        basestring = str
256
#[PL] Experimental setting: if True, OLE filenames will be kept in Unicode
# if False (default PIL behaviour), all filenames are converted to Latin-1.
KEEP_UNICODE_NAMES = False

#[PL] DEBUG display mode: False by default, use set_debug_mode() or "-d" on
# command line to change it.
DEBUG_MODE = False


def debug_print(msg):
    """
    Display a debug message (handler used when debug mode is on).
    msg: message to display
    """
    # single-argument call form: prints the same text on Python 2 and 3
    print(msg)


def debug_pass(msg):
    """
    Ignore a debug message (handler used when debug mode is off).
    msg: message to discard
    """
    pass


# current debug handler: rebound by set_debug_mode()
debug = debug_pass


def set_debug_mode(debug_mode):
    """
    Set debug mode on or off, to control display of debugging messages.
    debug_mode: True or False; the value is stored as-is in DEBUG_MODE, and
                the module-level debug handler is set to debug_print when it
                is true, to debug_pass otherwise.
    """
    global DEBUG_MODE, debug
    DEBUG_MODE = debug_mode
    if debug_mode:
        debug = debug_print
    else:
        debug = debug_pass
281
# Signature compared by isOleFile() with the first bytes of a file
# (hexadecimal spelling of the former octal literal, same 8 characters):
MAGIC = '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

# Note: the literals below carry no 'L' suffix; since Python 2.4 a hex literal
# above sys.maxint is automatically a positive long, so the values are the same.

#[PL]: added constants for Sector IDs (from AAF specifications)
MAXREGSECT = 0xFFFFFFFA  # maximum SECT
DIFSECT    = 0xFFFFFFFC  # (-4) denotes a DIFAT sector in a FAT
FATSECT    = 0xFFFFFFFD  # (-3) denotes a FAT sector in a FAT
ENDOFCHAIN = 0xFFFFFFFE  # (-2) end of a virtual stream chain
FREESECT   = 0xFFFFFFFF  # (-1) unallocated sector

#[PL]: added constants for Directory Entry IDs (from AAF specifications)
MAXREGSID  = 0xFFFFFFFA  # maximum directory entry ID
NOSTREAM   = 0xFFFFFFFF  # (-1) unallocated directory entry

#[PL] object types in storage (from AAF specifications)
STGTY_EMPTY     = 0  # empty directory entry (according to OpenOffice.org doc)
STGTY_STORAGE   = 1  # element is a storage object
STGTY_STREAM    = 2  # element is a stream object
STGTY_LOCKBYTES = 3  # element is an ILockBytes object
STGTY_PROPERTY  = 4  # element is an IPropertyStorage object
STGTY_ROOT      = 5  # element is a root storage
303
304
#
# --------------------------------------------------------------------
# property types

VT_EMPTY = 0
VT_NULL = 1
VT_I2 = 2
VT_I4 = 3
VT_R4 = 4
VT_R8 = 5
VT_CY = 6
VT_DATE = 7
VT_BSTR = 8
VT_DISPATCH = 9
VT_ERROR = 10
VT_BOOL = 11
VT_VARIANT = 12
VT_UNKNOWN = 13
VT_DECIMAL = 14
VT_I1 = 16
VT_UI1 = 17
VT_UI2 = 18
VT_UI4 = 19
VT_I8 = 20
VT_UI8 = 21
VT_INT = 22
VT_UINT = 23
VT_VOID = 24
VT_HRESULT = 25
VT_PTR = 26
VT_SAFEARRAY = 27
VT_CARRAY = 28
VT_USERDEFINED = 29
VT_LPSTR = 30
VT_LPWSTR = 31
VT_FILETIME = 64
VT_BLOB = 65
VT_STREAM = 66
VT_STORAGE = 67
VT_STREAMED_OBJECT = 68
VT_STORED_OBJECT = 69
VT_BLOB_OBJECT = 70
VT_CF = 71
VT_CLSID = 72
VT_VECTOR = 0x1000

# map property id to name (for debugging purposes)

VT = {}
# Iterate over a snapshot of the namespace: the loop variables are themselves
# created in this namespace while the loop runs, which must not disturb the
# iteration.
for keyword, var in list(vars().items()):
    if keyword[:3] == "VT_":
        VT[var] = keyword
325
#
# --------------------------------------------------------------------
# Some common document types (root.clsid fields)

WORD_CLSID = "00020900-0000-0000-C000-000000000046"
#TODO: check Excel, PPT, ...

#[PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect()
DEFECT_UNSURE    = 10  # a case which looks weird, but not sure it's a defect
DEFECT_POTENTIAL = 20  # a potential defect
DEFECT_INCORRECT = 30  # an error according to specifications, but parsing
                       # can go on
DEFECT_FATAL     = 40  # an error which cannot be ignored, parsing is
                       # impossible
340
#[PL] add useful constants to __all__:
# (a snapshot of the names is iterated, because the loop variable "key" is
# itself added to the module namespace while the loop runs)
for key in list(vars().keys()):
    if key.startswith('STGTY_') or key.startswith('DEFECT_'):
        __all__.append(key)
345
346
347#--- FUNCTIONS ----------------------------------------------------------------
348
def isOleFile (filename):
    """
    Test if file is an OLE container (according to its header).

    filename: file name or path (str, unicode)
    return: True if the file starts with the OLE signature (MAGIC),
            False otherwise.

    Errors raised while opening or reading the file are not caught here.
    """
    f = open(filename, 'rb')
    try:
        header = f.read(len(MAGIC))
    finally:
        # always release the file handle, even when the read fails
        f.close()
    return header == MAGIC
361
362
#TODO: replace i16 and i32 with more readable struct.unpack equivalent
def i16(c, o = 0):
    """
    Converts a 2-bytes (16 bits) string to an integer.
    The byte at offset o is the least significant one (little-endian).

    c: string containing bytes to convert
    o: offset of bytes to convert in string
    return: integer between 0 and 0xFFFF
    """
    return ord(c[o])+(ord(c[o+1])<<8)
372
373
def i32(c, o = 0):
    """
    Converts a 4-bytes (32 bits) string to an integer.
    The byte at offset o is the least significant one (little-endian).

    c: string containing bytes to convert
    o: offset of bytes to convert in string
    return: integer between 0 and 0xFFFFFFFF
    """
    return int(ord(c[o])+(ord(c[o+1])<<8)+(ord(c[o+2])<<16)+(ord(c[o+3])<<24))
    # [PL]: added int() because "<<" gives long int since Python 2.4
383
384
def _clsid(clsid):
    """
    Converts a CLSID to a human-readable string.

    clsid: string of length 16.
    return: "" if clsid only contains null characters, else a string of
            upper-case hexadecimal digits formatted as
            XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX. The first three groups are
            decoded with i32/i16, the last eight bytes are written in the
            order they are stored.
    """
    assert len(clsid) == 16
    if clsid == "\0" * len(clsid):
        return ""
    return (("%08X-%04X-%04X-%02X%02X-" + "%02X" * 6) %
            ((i32(clsid, 0), i16(clsid, 4), i16(clsid, 6)) +
            tuple(map(ord, clsid[8:16]))))
396
397
398
# UNICODE support for Old Python versions:
# (necessary to handle storages/streams names which use Unicode)

try:
    # is Unicode supported ?
    unicode

    def _unicode(s, errors='replace'):
        """
        Map unicode string to Latin 1. (Python with Unicode support)

        s: UTF-16LE unicode string to convert to Latin-1
        errors: 'replace', 'ignore' or 'strict'. See Python doc for unicode()
        return: the decoded unicode string if KEEP_UNICODE_NAMES is true,
                else the string encoded as Latin-1
        raise: IOError if the conversion fails
        """
        #TODO: test if it OleFileIO works with Unicode strings, instead of
        #      converting to Latin-1.
        try:
            # First the string is converted to plain Unicode:
            # (assuming it is encoded as UTF-16 little-endian)
            u = s.decode('UTF-16LE', errors)
            if KEEP_UNICODE_NAMES:
                return u
            else:
                # Second the unicode string is converted to Latin-1
                return u.encode('latin_1', errors)
        except Exception:
            # there was an error during Unicode to Latin-1 conversion.
            # (Exception instead of a bare except, so that KeyboardInterrupt
            # and SystemExit are not turned into an IOError)
            raise IOError('incorrect Unicode name')

except NameError:
    def _unicode(s, errors='replace'):
        """
        Map unicode string to Latin 1. (Python without native Unicode support)

        s: UTF-16LE unicode string to convert to Latin-1
        errors: 'replace', 'ignore' or 'strict'. (ignored in this version)
        """
        # If the unicode function does not exist, we assume this is an old
        # Python version without Unicode support.
        # Null bytes are simply removed (this only works with usual Latin-1
        # strings which do not contain unicode characters>256):
        return filter(ord, s)
441
442
def filetime2datetime(filetime):
    """
    convert FILETIME (64 bits int) to Python datetime.datetime

    filetime: integer, number of 100-nanosecond intervals elapsed since
              1601-01-01 00:00:00
    return: datetime.datetime object; the value is truncated to whole
            microseconds
    """
    # TODO: manage exception when microseconds is too large
    # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/
    _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
    #debug('timedelta days=%d' % (filetime/(10*1000000*3600*24)))
    # floor division keeps the microsecond count an exact integer, even when
    # "/" means true division (a float would lose precision for large values)
    return _FILETIME_null_date + datetime.timedelta(microseconds=filetime//10)
452
453
454
455#=== CLASSES ==================================================================
456
class OleMetadata:
    """
    class to parse and store metadata from standard properties of OLE files.

    Available attributes:
    codepage, title, subject, author, keywords, comments, template,
    last_saved_by, revision_number, total_edit_time, last_printed, create_time,
    last_saved_time, num_pages, num_words, num_chars, thumbnail,
    creating_application, security, codepage_doc, category, presentation_target,
    bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips,
    scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty,
    chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed,
    version, dig_sig, content_type, content_status, language, doc_version

    Note: an attribute is set to None when not present in the properties of the
    OLE file.

    References for SummaryInformation stream:
    - http://msdn.microsoft.com/en-us/library/dd942545.aspx
    - http://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx
    - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx
    - http://msdn.microsoft.com/en-us/library/aa372045.aspx
    - http://sedna-soft.de/summary-information-stream/
    - http://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html

    References for DocumentSummaryInformation stream:
    - http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
    - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx
    - http://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html

    new in version 0.25
    """

    # attribute names for SummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments',
        'template', 'last_saved_by', 'revision_number', 'total_edit_time',
        'last_printed', 'create_time', 'last_saved_time', 'num_pages',
        'num_words', 'num_chars', 'thumbnail', 'creating_application',
        'security']

    # attribute names for DocumentSummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs',
        'slides', 'notes', 'hidden_slides', 'mm_clips',
        'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager',
        'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc',
        'link_base', 'hlinks', 'hlinks_changed', 'version', 'dig_sig',
        'content_type', 'content_status', 'language', 'doc_version']

    def __init__(self):
        """
        Constructor for OleMetadata
        All attributes are set to None by default
        """
        # The attributes are created from the two class-level lists (first
        # the SummaryInformation names, then the DocumentSummaryInformation
        # names), so that they cannot get out of sync with parse_properties.
        for attrib in (self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS):
            setattr(self, attrib, None)


    def parse_properties(self, olefile):
        """
        Parse standard properties of an OLE file, from the streams
        "\x05SummaryInformation" and "\x05DocumentSummaryInformation",
        if present.
        Properties are converted to strings, integers or python datetime objects.
        If a property is not present, its value is set to None.

        olefile: object providing exists(name) and
                 getproperties(name, convert_time=..., no_conversion=...),
                 the latter returning a dictionary indexed by property id
        """
        # first set all attributes to None:
        for attrib in (self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS):
            setattr(self, attrib, None)
        if olefile.exists("\x05SummaryInformation"):
            # get properties from the stream:
            # (converting timestamps to python datetime, except total_edit_time,
            # which is property #10)
            props = olefile.getproperties("\x05SummaryInformation",
                convert_time=True, no_conversion=[10])
            # store them into this object's attributes:
            for i in range(len(self.SUMMARY_ATTRIBS)):
                # ids for standard properties start at 0x01, one id per
                # entry of SUMMARY_ATTRIBS
                value = props.get(i+1, None)
                setattr(self, self.SUMMARY_ATTRIBS[i], value)
        if olefile.exists("\x05DocumentSummaryInformation"):
            # get properties from the stream:
            props = olefile.getproperties("\x05DocumentSummaryInformation",
                convert_time=True)
            # store them into this object's attributes:
            for i in range(len(self.DOCSUM_ATTRIBS)):
                # ids for standard properties start at 0x01, one id per
                # entry of DOCSUM_ATTRIBS
                value = props.get(i+1, None)
                setattr(self, self.DOCSUM_ATTRIBS[i], value)

    def dump(self):
        """
        Dump all metadata, for debugging purposes.
        Each attribute is printed on standard output as "- name: repr(value)".
        """
        # single-argument call form of print: same output on Python 2 and 3
        print('Properties from SummaryInformation stream:')
        for prop in self.SUMMARY_ATTRIBS:
            value = getattr(self, prop)
            print('- %s: %s' % (prop, repr(value)))
        print('Properties from DocumentSummaryInformation stream:')
        for prop in self.DOCSUM_ATTRIBS:
            value = getattr(self, prop)
            print('- %s: %s' % (prop, repr(value)))
607
608
609#--- _OleStream ---------------------------------------------------------------
610
class _OleStream(StringIO.StringIO):
    """
    OLE2 Stream

    Returns a read-only file object which can be used to read
    the contents of a OLE stream (instance of the StringIO class).
    To open a stream, use the openstream method in the OleFile class.

    This function can be used with either ordinary streams,
    or ministreams, depending on the offset, sectorsize, and
    fat table arguments.

    Attributes:
        - size: actual size of data stream, after it was opened.
    """

    # FIXME: should store the list of sects obtained by following
    # the fat chain, and load new sectors on demand instead of
    # loading it all in one go.

    def __init__(self, fp, sect, size, offset, sectorsize, fat, filesize):
        """
        Constructor for _OleStream class.

        fp        : file object, the OLE container or the MiniFAT stream
        sect      : sector index of first sector in the stream
        size      : total size of the stream (0x7FFFFFFF means that the size
                    is not known in advance)
        offset    : offset in bytes for the first FAT or MiniFAT sector
        sectorsize: size of one sector
        fat       : array/list of sector indexes (FAT or MiniFAT)
        filesize  : size of OLE file (for debugging)
        return    : a StringIO instance containing the OLE stream
        raise     : IOError if the sector chain or the stream size is
                    inconsistent
        """
        debug('_OleStream.__init__:')
        debug(' sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s'
            %(sect,sect,size,offset,sectorsize,len(fat), repr(fp)))
        #[PL] To detect malformed documents with FAT loops, we compute the
        # expected number of sectors in the stream:
        unknown_size = False
        if size==0x7FFFFFFF:
            # this is the case when called from OleFileIO._open(), and stream
            # size is not known in advance (for example when reading the
            # Directory stream). Then we can only guess maximum size:
            size = len(fat)*sectorsize
            # and we keep a record that size was unknown:
            unknown_size = True
            debug(' stream with UNKNOWN SIZE')
        # number of sectors, rounded up (explicit floor division, so that the
        # result is an integer whatever "/" means):
        nb_sectors = (size + (sectorsize-1)) // sectorsize
        debug('nb_sectors = %d' % nb_sectors)
        # This number should (at least) be less than the total number of
        # sectors in the given FAT:
        if nb_sectors > len(fat):
            raise IOError('malformed OLE document, stream too large')
        # optimization(?): data is first a list of strings, and join() is called
        # at the end to concatenate all in one string.
        # (this may not be really useful with recent Python versions)
        data = []
        # if size is zero, then first sector index should be ENDOFCHAIN:
        if size == 0 and sect != ENDOFCHAIN:
            debug('size == 0 and sect != ENDOFCHAIN:')
            raise IOError('incorrect OLE sector index for empty stream')
        #[PL] A fixed-length for loop is used instead of an undefined while
        # loop to avoid DoS attacks:
        for i in xrange(nb_sectors):
            # Sector index may be ENDOFCHAIN, but only if size was unknown
            if sect == ENDOFCHAIN:
                if unknown_size:
                    break
                else:
                    # else this means that the stream is smaller than declared:
                    debug('sect=ENDOFCHAIN before expected size')
                    raise IOError('incomplete OLE stream')
            # sector index should be within FAT:
            if sect<0 or sect>=len(fat):
                debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat)))
                debug('i=%d / nb_sectors=%d' %(i, nb_sectors))
                raise IOError('incorrect OLE FAT, sector index out of range')
            #TODO: merge this code with OleFileIO.getsect() ?
            #TODO: check if this works with 4K sectors:
            try:
                fp.seek(offset + sectorsize * sect)
            except Exception:
                # any seek failure is reported as an IOError (Exception
                # instead of a bare except, so that KeyboardInterrupt and
                # SystemExit are not masked)
                debug('sect=%d, seek=%d, filesize=%d' %
                    (sect, offset+sectorsize*sect, filesize))
                raise IOError('OLE sector index out of range')
            sector_data = fp.read(sectorsize)
            # [PL] check if there was enough data:
            # Note: if sector is the last of the file, sometimes it is not a
            # complete sector (of 512 or 4K), so we may read less than
            # sectorsize.
            if len(sector_data)!=sectorsize and sect!=(len(fat)-1):
                debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' %
                    (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data)))
                debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data)))
                raise IOError('incomplete OLE sector')
            data.append(sector_data)
            # jump to next sector in the FAT:
            try:
                sect = fat[sect]
            except IndexError:
                # [PL] if pointer is out of the FAT an exception is raised
                raise IOError('incorrect OLE FAT, sector index out of range')
        #[PL] Last sector should be a "end of chain" marker:
        if sect != ENDOFCHAIN:
            raise IOError('incorrect last sector index in OLE stream')
        data = "".join(data)
        # Data is truncated to the actual stream size:
        if len(data) >= size:
            data = data[:size]
            # actual stream size is stored for future use:
            self.size = size
        elif unknown_size:
            # actual stream size was not known, now we know the size of read
            # data:
            self.size = len(data)
        else:
            # read data is less than expected:
            debug('len(data)=%d, size=%d' % (len(data), size))
            raise IOError('OLE stream size is less than declared')
        # when all data is read in memory, StringIO constructor is called
        StringIO.StringIO.__init__(self, data)
        # Then the _OleStream object can be used as a read-only file object.
738
739
740#--- _OleDirectoryEntry -------------------------------------------------------
741
class _OleDirectoryEntry:

    """
    OLE2 Directory Entry

    Holds the fields decoded from one 128-byte record of the OLE directory
    stream. For a storage, build_storage_tree() fills the kids list and the
    kids_dict dictionary with the child entries.
    """
    #[PL] parsing code moved from OleFileIO.loaddirectory

    # struct to parse directory entries:
    # <: little-endian byte order, standard sizes
    #    (note: this should guarantee that Q returns a 64 bits int)
    # 64s: string containing entry name in unicode (max 31 chars) + null char
    # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
    # B: uint8, dir entry type (between 0 and 5)
    # B: uint8, color: 0=black, 1=red
    # I: uint32, index of left child node in the red-black tree, NOSTREAM if none
    # I: uint32, index of right child node in the red-black tree, NOSTREAM if none
    # I: uint32, index of child root node if it is a storage, else NOSTREAM
    # 16s: CLSID, unique identifier (only used if it is a storage)
    # I: uint32, user flags
    # Q (was 8s): uint64, creation timestamp or zero
    # Q (was 8s): uint64, modification timestamp or zero
    # I: uint32, SID of first sector if stream or ministream, SID of 1st sector
    #    of stream containing ministreams if root entry, 0 otherwise
    # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise
    # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise
    STRUCT_DIRENTRY = '<64sHBBIII16sIQQIII'
    # size of a directory entry: 128 bytes
    DIRENTRY_SIZE = 128
    assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE


    def __init__(self, entry, sid, olefile):
        """
        Constructor for an _OleDirectoryEntry object.
        Parses a 128-bytes entry from the OLE Directory stream.

        entry  : string (must be 128 bytes long)
        sid    : index of this directory entry in the OLE file directory
        olefile: OleFileIO containing this directory entry

        Inconsistencies found while decoding are reported through
        olefile._raise_defect(), which raises or records them depending on
        the defect level chosen for olefile.
        """
        self.sid = sid
        # ref to olefile is stored for future use
        self.olefile = olefile
        # kids is a list of children entries, if this entry is a storage:
        # (list of _OleDirectoryEntry objects)
        self.kids = []
        # kids_dict is a dictionary of children entries, indexed by their
        # name in lowercase: used to quickly find an entry, and to detect
        # duplicates
        self.kids_dict = {}
        # flag used to detect if the entry is referenced more than once in
        # directory:
        self.used = False
        # decode DirEntry
        (
            name,
            namelength,
            self.entry_type,
            self.color,
            self.sid_left,
            self.sid_right,
            self.sid_child,
            clsid,
            self.dwUserFlags,
            self.createTime,
            self.modifyTime,
            self.isectStart,
            sizeLow,
            sizeHigh
        ) = struct.unpack(_OleDirectoryEntry.STRUCT_DIRENTRY, entry)
        if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]:
            olefile._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type')
        # only first directory entry can (and should) be root:
        if self.entry_type == STGTY_ROOT and sid != 0:
            olefile._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry')
        if sid == 0 and self.entry_type != STGTY_ROOT:
            olefile._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry')
        # name should be at most 31 unicode characters + null character,
        # so 64 bytes in total (31*2 + 2):
        if namelength>64:
            olefile._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length')
            # if exception not raised, namelength is set to the maximum value:
            namelength = 64
        # only characters without ending null char are kept. The end of the
        # slice is clamped to zero: with a declared length below 2 a negative
        # index would otherwise keep almost the whole 64-byte buffer.
        name = name[:max(namelength-2, 0)]
        # name is converted from unicode to Latin-1:
        self.name = _unicode(name)

        debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name)))
        debug(' - type: %d' % self.entry_type)
        debug(' - sect: %d' % self.isectStart)
        debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left,
            self.sid_right, self.sid_child))

        # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes
        # sectors, BUT apparently some implementations set it as 0xFFFFFFFF, 1
        # or some other value so it cannot be raised as a defect in general:
        if olefile.sectorsize == 512:
            if sizeHigh != 0 and sizeHigh != 0xFFFFFFFF:
                debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' %
                    (olefile.sectorsize, sizeLow, sizeHigh, sizeHigh))
                olefile._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size')
            self.size = sizeLow
        else:
            # integers are promoted automatically, no explicit long() needed
            self.size = sizeLow + (sizeHigh << 32)
        debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, sizeLow, sizeHigh))

        self.clsid = _clsid(clsid)
        # a storage should have a null size, BUT some implementations such as
        # Word 8 for Mac seem to allow non-null values => Potential defect:
        if self.entry_type == STGTY_STORAGE and self.size != 0:
            olefile._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0')
        # check if stream is not already referenced elsewhere:
        if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0:
            # only streams can be in MiniFAT (the root entry never is):
            minifat = (self.entry_type == STGTY_STREAM
                       and self.size < olefile.minisectorcutoff)
            olefile._check_duplicate_stream(self.isectStart, minifat)


    def build_storage_tree(self):
        """
        Read and build the red-black tree attached to this _OleDirectoryEntry
        object, if it is a storage.
        Note that this method builds a tree of all subentries, so it should
        only be called for the root object once.
        """
        debug('build_storage_tree: SID=%d - %s - sid_child=%d'
            % (self.sid, repr(self.name), self.sid_child))
        if self.sid_child != NOSTREAM:
            # if child SID is not NOSTREAM, then this entry is a storage.
            # Let's walk through the tree of children to fill the kids list:
            self.append_kids(self.sid_child)

            # Note from OpenOffice documentation: the safest way is to
            # recreate the tree because some implementations may store broken
            # red-black trees...

            # in the OLE file, entries are sorted on (length, name).
            # for convenience, we sort them on name instead:
            # (see the comparison methods in this class)
            self.kids.sort()


    def append_kids(self, child_sid):
        """
        Walk through red-black tree of children of this directory entry to add
        all of them to the kids list. (recursive method)

        child_sid : index of the child directory entry to start from, or
                    NOSTREAM when there is nothing to walk.

        An entry that was already attached to a storage is reported as a
        defect and skipped, so that a cyclic tree cannot recurse forever.
        """
        #[PL] this method was added to use simple recursion instead of a complex
        # algorithm.
        # if this is not a storage or a leaf of the tree, nothing to do:
        if child_sid == NOSTREAM:
            return
        # check if child SID is in the proper range:
        if child_sid<0 or child_sid>=len(self.olefile.direntries):
            self.olefile._raise_defect(DEFECT_FATAL, 'OLE DirEntry index out of range')
            # only reached when the defect was not raised: the index cannot
            # be used to fetch an entry, so stop here
            return
        # get child direntry:
        child = self.olefile._load_direntry(child_sid) #direntries[child_sid]
        debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d'
            % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child))
        # Check if kid was not already referenced in a storage. This has to be
        # done BEFORE walking its subtrees: otherwise a tree containing a
        # cycle would be walked again and again.
        if child.used:
            self.olefile._raise_defect(DEFECT_INCORRECT,
                'OLE Entry referenced more than once')
            return
        child.used = True
        # the directory entries are organized as a red-black tree.
        # (cf. Wikipedia for details)
        # First walk through left side of the tree:
        self.append_kids(child.sid_left)
        # Check if its name is not already used (case-insensitive):
        name_lower = child.name.lower()
        if name_lower in self.kids_dict:
            self.olefile._raise_defect(DEFECT_INCORRECT,
                "Duplicate filename in OLE storage")
        # Then the child_sid _OleDirectoryEntry object is appended to the
        # kids list and dictionary:
        self.kids.append(child)
        self.kids_dict[name_lower] = child
        # Finally walk through right side of the tree:
        self.append_kids(child.sid_right)
        # Afterwards build kid's own tree if it's also a storage:
        child.build_storage_tree()


    # Entries are compared by name. The rich comparison methods are what
    # list.sort() uses on Python 3; __cmp__ is kept for code calling it.
    #TODO: replace by the same function as MS implementation ?
    # (order by name length first, then case-insensitive order)

    def __eq__(self, other):
        "Compare entries by name"
        return self.name == other.name


    def __ne__(self, other):
        "Compare entries by name"
        return not self.__eq__(other)


    def __lt__(self, other):
        "Compare entries by name"
        return self.name < other.name


    def __le__(self, other):
        "Compare entries by name"
        return self.__eq__(other) or self.__lt__(other)


    def __cmp__(self, other):
        "Compare entries by name"
        # same result as cmp(self.name, other.name), without the cmp builtin
        return (self.name > other.name) - (self.name < other.name)


    def dump(self, tab = 0):
        "Dump this entry, and all its subentries (for debug purposes only)"
        TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)",
                 "(property)", "(root)"]
        # entry_type is a raw byte read from the file: do not let an
        # unexpected value break the dump
        if 0 <= self.entry_type < len(TYPES):
            type_label = TYPES[self.entry_type]
        else:
            type_label = "(unknown)"
        fields = [" "*tab + repr(self.name), type_label]
        if self.entry_type in (STGTY_STREAM, STGTY_ROOT):
            fields.append(str(self.size))
            fields.append("bytes")
        # a single-argument print() behaves the same on Python 2 and 3
        print(" ".join(fields))
        if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid:
            print(" "*tab + "{%s}" % self.clsid)

        for kid in self.kids:
            kid.dump(tab + 2)


    def getmtime(self):
        """
        Return modification time of a directory entry.

        return: None if modification time is null, otherwise the value
            returned by filetime2datetime() for the raw timestamp

        new in version 0.26
        """
        if self.modifyTime == 0:
            return None
        return filetime2datetime(self.modifyTime)


    def getctime(self):
        """
        Return creation time of a directory entry.

        return: None if creation time is null, otherwise the value
            returned by filetime2datetime() for the raw timestamp

        new in version 0.26
        """
        if self.createTime == 0:
            return None
        return filetime2datetime(self.createTime)
983
984
985#--- OleFileIO ----------------------------------------------------------------
986
987class OleFileIO:
988 """
989 OLE container object
990
991 This class encapsulates the interface to an OLE 2 structured
992 storage file. Use the {@link listdir} and {@link openstream} methods to
993 access the contents of this file.
994
995 Object names are given as a list of strings, one for each subentry
996 level. The root entry should be omitted. For example, the following
997 code extracts all image streams from a Microsoft Image Composer file:
998
999 ole = OleFileIO("fan.mic")
1000
1001 for entry in ole.listdir():
1002 if entry[1:2] == ["Image"]:
1003 fin = ole.openstream(entry)
1004 fout = open(entry[0], "wb")
1005 while 1:
1006 s = fin.read(8192)
1007 if not s:
1008 break
1009 fout.write(s)
1010
1011 You can use the viewer application provided with the Python Imaging
1012 Library to view the resulting files (which happens to be standard
1013 TIFF files).
1014 """
1015
1016 def __init__(self, filename = None, raise_defects=DEFECT_FATAL):
1017 """
1018 Constructor for OleFileIO class.
1019
1020 filename: file to open.
1021 raise_defects: minimal level for defects to be raised as exceptions.
1022 (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a
1023 security-oriented application, see source code for details)
1024 """
1025 # minimal level for defects to be raised as exceptions:
1026 self._raise_defects_level = raise_defects
1027 # list of defects/issues not raised as exceptions:
1028 # tuples of (exception type, message)
1029 self.parsing_issues = []
1030 if filename:
1031 self.open(filename)
1032
1033
1034 def _raise_defect(self, defect_level, message, exception_type=IOError):
1035 """
1036 This method should be called for any defect found during file parsing.
1037 It may raise an IOError exception according to the minimal level chosen
1038 for the OleFileIO object.
1039
1040 defect_level: defect level, possible values are:
1041 DEFECT_UNSURE : a case which looks weird, but not sure it's a defect
1042 DEFECT_POTENTIAL : a potential defect
1043 DEFECT_INCORRECT : an error according to specifications, but parsing can go on
1044 DEFECT_FATAL : an error which cannot be ignored, parsing is impossible
1045 message: string describing the defect, used with raised exception.
1046 exception_type: exception class to be raised, IOError by default
1047 """
1048 # added by [PL]
1049 if defect_level >= self._raise_defects_level:
1050 raise exception_type, message
1051 else:
1052 # just record the issue, no exception raised:
1053 self.parsing_issues.append((exception_type, message))
1054
1055
1056 def open(self, filename):
1057 """
1058 Open an OLE2 file.
1059 Reads the header, FAT and directory.
1060
1061 filename: string-like or file-like object
1062 """
1063 #[PL] check if filename is a string-like or file-like object:
1064 # (it is better to check for a read() method)
1065 if hasattr(filename, 'read'):
1066 # file-like object
1067 self.fp = filename
1068 else:
1069 # string-like object: filename of file on disk
1070 #TODO: if larger than 1024 bytes, this could be the actual data => StringIO
1071 self.fp = open(filename, "rb")
1072 # old code fails if filename is not a plain string:
1073 #if type(filename) == type(""):
1074 # self.fp = open(filename, "rb")
1075 #else:
1076 # self.fp = filename
1077 # obtain the filesize by using seek and tell, which should work on most
1078 # file-like objects:
1079 #TODO: do it above, using getsize with filename when possible?
1080 #TODO: fix code to fail with clear exception when filesize cannot be obtained
1081 self.fp.seek(0, os.SEEK_END)
1082 try:
1083 filesize = self.fp.tell()
1084 finally:
1085 self.fp.seek(0)
1086 self._filesize = filesize
1087
1088 # lists of streams in FAT and MiniFAT, to detect duplicate references
1089 # (list of indexes of first sectors of each stream)
1090 self._used_streams_fat = []
1091 self._used_streams_minifat = []
1092
1093 header = self.fp.read(512)
1094
1095 if len(header) != 512 or header[:8] != MAGIC:
1096 self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file")
1097
1098 # [PL] header structure according to AAF specifications:
1099 ##Header
1100 ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)]
1101 ##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
1102 ## // 0x1a, 0xe1} for current version
1103 ##CLSID _clsid; // [08H,16] reserved must be zero (WriteClassStg/
1104 ## // GetClassFile uses root directory class id)
1105 ##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is
1106 ## // written by reference implementation
1107 ##USHORT _uDllVersion; // [1AH,02] major version of the dll/format: 3 for
1108 ## // 512-byte sectors, 4 for 4 KB sectors
1109 ##USHORT _uByteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering
1110 ##USHORT _uSectorShift; // [1EH,02] size of sectors in power-of-two;
1111 ## // typically 9 indicating 512-byte sectors
1112 ##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two;
1113 ## // typically 6 indicating 64-byte mini-sectors
1114 ##USHORT _usReserved; // [22H,02] reserved, must be zero
1115 ##ULONG _ulReserved1; // [24H,04] reserved, must be zero
1116 ##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors,
1117 ## // number of SECTs in directory chain for 4 KB
1118 ## // sectors
1119 ##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain
1120 ##SECT _sectDirStart; // [30H,04] first SECT in the directory chain
1121 ##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must
1122 ## // be zero. The reference implementation
1123 ## // does not support transactions
1124 ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream;
1125 ## // typically 4096 bytes
1126 ##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
1127 ##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain
1128 ##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain
1129 ##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain
1130 ##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors
1131 ##};
1132
1133 # [PL] header decoding:
1134 # '<' indicates little-endian byte ordering for Intel (cf. struct module help)
1135 fmt_header = '<8s16sHHHHHHLLLLLLLLLL'
1136 header_size = struct.calcsize(fmt_header)
1137 debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) )
1138 header1 = header[:header_size]
1139 (
1140 self.Sig,
1141 self.clsid,
1142 self.MinorVersion,
1143 self.DllVersion,
1144 self.ByteOrder,
1145 self.SectorShift,
1146 self.MiniSectorShift,
1147 self.Reserved, self.Reserved1,
1148 self.csectDir,
1149 self.csectFat,
1150 self.sectDirStart,
1151 self.signature,
1152 self.MiniSectorCutoff,
1153 self.MiniFatStart,
1154 self.csectMiniFat,
1155 self.sectDifStart,
1156 self.csectDif
1157 ) = struct.unpack(fmt_header, header1)
1158 debug( struct.unpack(fmt_header, header1))
1159
1160 if self.Sig != '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1':
1161 # OLE signature should always be present
1162 self._raise_defect(DEFECT_FATAL, "incorrect OLE signature")
1163 if self.clsid != '\x00'*16:
1164 # according to AAF specs, CLSID should always be zero
1165 self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header")
1166 debug( "MinorVersion = %d" % self.MinorVersion )
1167 debug( "DllVersion = %d" % self.DllVersion )
1168 if self.DllVersion not in [3, 4]:
1169 # version 3: usual format, 512 bytes per sector
1170 # version 4: large format, 4K per sector
1171 self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header")
1172 debug( "ByteOrder = %X" % self.ByteOrder )
1173 if self.ByteOrder != 0xFFFE:
1174 # For now only common little-endian documents are handled correctly
1175 self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header")
1176 # TODO: add big-endian support for documents created on Mac ?
1177 self.SectorSize = 2**self.SectorShift
1178 debug( "SectorSize = %d" % self.SectorSize )
1179 if self.SectorSize not in [512, 4096]:
1180 self._raise_defect(DEFECT_INCORRECT, "incorrect SectorSize in OLE header")
1181 if (self.DllVersion==3 and self.SectorSize!=512) \
1182 or (self.DllVersion==4 and self.SectorSize!=4096):
1183 self._raise_defect(DEFECT_INCORRECT, "SectorSize does not match DllVersion in OLE header")
1184 self.MiniSectorSize = 2**self.MiniSectorShift
1185 debug( "MiniSectorSize = %d" % self.MiniSectorSize )
1186 if self.MiniSectorSize not in [64]:
1187 self._raise_defect(DEFECT_INCORRECT, "incorrect MiniSectorSize in OLE header")
1188 if self.Reserved != 0 or self.Reserved1 != 0:
1189 self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)")
1190 debug( "csectDir = %d" % self.csectDir )
1191 if self.SectorSize==512 and self.csectDir!=0:
1192 self._raise_defect(DEFECT_INCORRECT, "incorrect csectDir in OLE header")
1193 debug( "csectFat = %d" % self.csectFat )
1194 debug( "sectDirStart = %X" % self.sectDirStart )
1195 debug( "signature = %d" % self.signature )
1196 # Signature should be zero, BUT some implementations do not follow this
1197 # rule => only a potential defect:
1198 if self.signature != 0:
1199 self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (signature>0)")
1200 debug( "MiniSectorCutoff = %d" % self.MiniSectorCutoff )
1201 debug( "MiniFatStart = %X" % self.MiniFatStart )
1202 debug( "csectMiniFat = %d" % self.csectMiniFat )
1203 debug( "sectDifStart = %X" % self.sectDifStart )
1204 debug( "csectDif = %d" % self.csectDif )
1205
1206 # calculate the number of sectors in the file
1207 # (-1 because header doesn't count)
1208 self.nb_sect = ( (filesize + self.SectorSize-1) / self.SectorSize) - 1
1209 debug( "Number of sectors in the file: %d" % self.nb_sect )
1210
1211 # file clsid (probably never used, so we don't store it)
1212 clsid = _clsid(header[8:24])
1213 self.sectorsize = self.SectorSize #1 << i16(header, 30)
1214 self.minisectorsize = self.MiniSectorSize #1 << i16(header, 32)
1215 self.minisectorcutoff = self.MiniSectorCutoff # i32(header, 56)
1216
1217 # check known streams for duplicate references (these are always in FAT,
1218 # never in MiniFAT):
1219 self._check_duplicate_stream(self.sectDirStart)
1220 # check MiniFAT only if it is not empty:
1221 if self.csectMiniFat:
1222 self._check_duplicate_stream(self.MiniFatStart)
1223 # check DIFAT only if it is not empty:
1224 if self.csectDif:
1225 self._check_duplicate_stream(self.sectDifStart)
1226
1227 # Load file allocation tables
1228 self.loadfat(header)
1229 # Load direcory. This sets both the direntries list (ordered by sid)
1230 # and the root (ordered by hierarchy) members.
1231 self.loaddirectory(self.sectDirStart)#i32(header, 48))
1232 self.ministream = None
1233 self.minifatsect = self.MiniFatStart #i32(header, 60)
1234
1235
1236 def close(self):
1237 """
1238 close the OLE file, to release the file object
1239 """
1240 self.fp.close()
1241
1242
1243 def _check_duplicate_stream(self, first_sect, minifat=False):
1244 """
1245 Checks if a stream has not been already referenced elsewhere.
1246 This method should only be called once for each known stream, and only
1247 if stream size is not null.
1248 first_sect: index of first sector of the stream in FAT
1249 minifat: if True, stream is located in the MiniFAT, else in the FAT
1250 """
1251 if minifat:
1252 debug('_check_duplicate_stream: sect=%d in MiniFAT' % first_sect)
1253 used_streams = self._used_streams_minifat
1254 else:
1255 debug('_check_duplicate_stream: sect=%d in FAT' % first_sect)
1256 # some values can be safely ignored (not a real stream):
1257 if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT):
1258 return
1259 used_streams = self._used_streams_fat
1260 #TODO: would it be more efficient using a dict or hash values, instead
1261 # of a list of long ?
1262 if first_sect in used_streams:
1263 self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice')
1264 else:
1265 used_streams.append(first_sect)
1266
1267
1268 def dumpfat(self, fat, firstindex=0):
1269 "Displays a part of FAT in human-readable form for debugging purpose"
1270 # [PL] added only for debug
1271 if not DEBUG_MODE:
1272 return
1273 # dictionary to convert special FAT values in human-readable strings
1274 VPL=8 # valeurs par ligne (8+1 * 8+1 = 81)
1275 fatnames = {
1276 FREESECT: "..free..",
1277 ENDOFCHAIN: "[ END. ]",
1278 FATSECT: "FATSECT ",
1279 DIFSECT: "DIFSECT "
1280 }
1281 nbsect = len(fat)
1282 nlines = (nbsect+VPL-1)/VPL
1283 print "index",
1284 for i in range(VPL):
1285 print ("%8X" % i),
1286 print ""
1287 for l in range(nlines):
1288 index = l*VPL
1289 print ("%8X:" % (firstindex+index)),
1290 for i in range(index, index+VPL):
1291 if i>=nbsect:
1292 break
1293 sect = fat[i]
1294 if sect in fatnames:
1295 nom = fatnames[sect]
1296 else:
1297 if sect == i+1:
1298 nom = " --->"
1299 else:
1300 nom = "%8X" % sect
1301 print nom,
1302 print ""
1303
1304
1305 def dumpsect(self, sector, firstindex=0):
1306 "Displays a sector in a human-readable form, for debugging purpose."
1307 if not DEBUG_MODE:
1308 return
1309 VPL=8 # number of values per line (8+1 * 8+1 = 81)
1310 tab = array.array(UINT32, sector)
1311 nbsect = len(tab)
1312 nlines = (nbsect+VPL-1)/VPL
1313 print "index",
1314 for i in range(VPL):
1315 print ("%8X" % i),
1316 print ""
1317 for l in range(nlines):
1318 index = l*VPL
1319 print ("%8X:" % (firstindex+index)),
1320 for i in range(index, index+VPL):
1321 if i>=nbsect:
1322 break
1323 sect = tab[i]
1324 nom = "%8X" % sect
1325 print nom,
1326 print ""
1327
1328 def sect2array(self, sect):
1329 """
1330 convert a sector to an array of 32 bits unsigned integers,
1331 swapping bytes on big endian CPUs such as PowerPC (old Macs)
1332 """
1333 a = array.array(UINT32, sect)
1334 # if CPU is big endian, swap bytes:
1335 if sys.byteorder == 'big':
1336 a.byteswap()
1337 return a
1338
1339
1340 def loadfat_sect(self, sect):
1341 """
1342 Adds the indexes of the given sector to the FAT
1343 sect: string containing the first FAT sector, or array of long integers
1344 return: index of last FAT sector.
1345 """
1346 # a FAT sector is an array of ulong integers.
1347 if isinstance(sect, array.array):
1348 # if sect is already an array it is directly used
1349 fat1 = sect
1350 else:
1351 # if it's a raw sector, it is parsed in an array
1352 fat1 = self.sect2array(sect)
1353 self.dumpsect(sect)
1354 # The FAT is a sector chain starting at the first index of itself.
1355 for isect in fat1:
1356 #print "isect = %X" % isect
1357 if isect == ENDOFCHAIN or isect == FREESECT:
1358 # the end of the sector chain has been reached
1359 break
1360 # read the FAT sector
1361 s = self.getsect(isect)
1362 # parse it as an array of 32 bits integers, and add it to the
1363 # global FAT array
1364 nextfat = self.sect2array(s)
1365 self.fat = self.fat + nextfat
1366 return isect
1367
1368
1369 def loadfat(self, header):
1370 """
1371 Load the FAT table.
1372 """
1373 # The header contains a sector numbers
1374 # for the first 109 FAT sectors. Additional sectors are
1375 # described by DIF blocks
1376
1377 sect = header[76:512]
1378 debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)/4) )
1379 #fat = []
1380 # [PL] FAT is an array of 32 bits unsigned ints, it's more effective
1381 # to use an array than a list in Python.
1382 # It's initialized as empty first:
1383 self.fat = array.array(UINT32)
1384 self.loadfat_sect(sect)
1385 #self.dumpfat(self.fat)
1386## for i in range(0, len(sect), 4):
1387## ix = i32(sect, i)
1388## #[PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL:
1389## if ix == 0xFFFFFFFEL or ix == 0xFFFFFFFFL:
1390## break
1391## s = self.getsect(ix)
1392## #fat = fat + map(lambda i, s=s: i32(s, i), range(0, len(s), 4))
1393## fat = fat + array.array(UINT32, s)
1394 if self.csectDif != 0:
1395 # [PL] There's a DIFAT because file is larger than 6.8MB
1396 # some checks just in case:
1397 if self.csectFat <= 109:
1398 # there must be at least 109 blocks in header and the rest in
1399 # DIFAT, so number of sectors must be >109.
1400 self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors')
1401 if self.sectDifStart >= self.nb_sect:
1402 # initial DIFAT block index must be valid
1403 self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range')
1404 debug( "DIFAT analysis..." )
1405 # We compute the necessary number of DIFAT sectors :
1406 # (each DIFAT sector = 127 pointers + 1 towards next DIFAT sector)
1407 nb_difat = (self.csectFat-109 + 126)/127
1408 debug( "nb_difat = %d" % nb_difat )
1409 if self.csectDif != nb_difat:
1410 raise IOError, 'incorrect DIFAT'
1411 isect_difat = self.sectDifStart
1412 for i in xrange(nb_difat):
1413 debug( "DIFAT block %d, sector %X" % (i, isect_difat) )
1414 #TODO: check if corresponding FAT SID = DIFSECT
1415 sector_difat = self.getsect(isect_difat)
1416 difat = self.sect2array(sector_difat)
1417 self.dumpsect(sector_difat)
1418 self.loadfat_sect(difat[:127])
1419 # last DIFAT pointer is next DIFAT sector:
1420 isect_difat = difat[127]
1421 debug( "next DIFAT sector: %X" % isect_difat )
1422 # checks:
1423 if isect_difat not in [ENDOFCHAIN, FREESECT]:
1424 # last DIFAT pointer value must be ENDOFCHAIN or FREESECT
1425 raise IOError, 'incorrect end of DIFAT'
1426## if len(self.fat) != self.csectFat:
1427## # FAT should contain csectFat blocks
1428## print "FAT length: %d instead of %d" % (len(self.fat), self.csectFat)
1429## raise IOError, 'incorrect DIFAT'
1430 # since FAT is read from fixed-size sectors, it may contain more values
1431 # than the actual number of sectors in the file.
1432 # Keep only the relevant sector indexes:
1433 if len(self.fat) > self.nb_sect:
1434 debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect))
1435 self.fat = self.fat[:self.nb_sect]
1436 debug('\nFAT:')
1437 self.dumpfat(self.fat)
1438
1439
1440 def loadminifat(self):
1441 """
1442 Load the MiniFAT table.
1443 """
1444 # MiniFAT is stored in a standard sub-stream, pointed to by a header
1445 # field.
1446 # NOTE: there are two sizes to take into account for this stream:
1447 # 1) Stream size is calculated according to the number of sectors
1448 # declared in the OLE header. This allocated stream may be more than
1449 # needed to store the actual sector indexes.
1450 # (self.csectMiniFat is the number of sectors of size self.SectorSize)
1451 stream_size = self.csectMiniFat * self.SectorSize
1452 # 2) Actually used size is calculated by dividing the MiniStream size
1453 # (given by root entry size) by the size of mini sectors, *4 for
1454 # 32 bits indexes:
1455 nb_minisectors = (self.root.size + self.MiniSectorSize-1) / self.MiniSectorSize
1456 used_size = nb_minisectors * 4
1457 debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' %
1458 (self.minifatsect, self.csectMiniFat, used_size, stream_size, nb_minisectors))
1459 if used_size > stream_size:
1460 # This is not really a problem, but may indicate a wrong implementation:
1461 self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT')
1462 # In any case, first read stream_size:
1463 s = self._open(self.minifatsect, stream_size, force_FAT=True).read()
1464 #[PL] Old code replaced by an array:
1465 #self.minifat = map(lambda i, s=s: i32(s, i), range(0, len(s), 4))
1466 self.minifat = self.sect2array(s)
1467 # Then shrink the array to used size, to avoid indexes out of MiniStream:
1468 debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors))
1469 self.minifat = self.minifat[:nb_minisectors]
1470 debug('loadminifat(): len=%d' % len(self.minifat))
1471 debug('\nMiniFAT:')
1472 self.dumpfat(self.minifat)
1473
1474 def getsect(self, sect):
1475 """
1476 Read given sector from file on disk.
1477 sect: sector index
1478 returns a string containing the sector data.
1479 """
1480 # [PL] this original code was wrong when sectors are 4KB instead of
1481 # 512 bytes:
1482 #self.fp.seek(512 + self.sectorsize * sect)
1483 #[PL]: added safety checks:
1484 #print "getsect(%X)" % sect
1485 try:
1486 self.fp.seek(self.sectorsize * (sect+1))
1487 except:
1488 debug('getsect(): sect=%X, seek=%d, filesize=%d' %
1489 (sect, self.sectorsize*(sect+1), self._filesize))
1490 self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
1491 sector = self.fp.read(self.sectorsize)
1492 if len(sector) != self.sectorsize:
1493 debug('getsect(): sect=%X, read=%d, sectorsize=%d' %
1494 (sect, len(sector), self.sectorsize))
1495 self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector')
1496 return sector
1497
1498
1499 def loaddirectory(self, sect):
1500 """
1501 Load the directory.
1502 sect: sector index of directory stream.
1503 """
1504 # The directory is stored in a standard
1505 # substream, independent of its size.
1506
1507 # open directory stream as a read-only file:
1508 # (stream size is not known in advance)
1509 self.directory_fp = self._open(sect)
1510
1511 #[PL] to detect malformed documents and avoid DoS attacks, the maximum
1512 # number of directory entries can be calculated:
1513 max_entries = self.directory_fp.size / 128
1514 debug('loaddirectory: size=%d, max_entries=%d' %
1515 (self.directory_fp.size, max_entries))
1516
1517 # Create list of directory entries
1518 #self.direntries = []
1519 # We start with a list of "None" object
1520 self.direntries = [None] * max_entries
1521## for sid in xrange(max_entries):
1522## entry = fp.read(128)
1523## if not entry:
1524## break
1525## self.direntries.append(_OleDirectoryEntry(entry, sid, self))
1526 # load root entry:
1527 root_entry = self._load_direntry(0)
1528 # Root entry is the first entry:
1529 self.root = self.direntries[0]
1530 # read and build all storage trees, starting from the root:
1531 self.root.build_storage_tree()
1532
1533
1534 def _load_direntry (self, sid):
1535 """
1536 Load a directory entry from the directory.
1537 This method should only be called once for each storage/stream when
1538 loading the directory.
1539 sid: index of storage/stream in the directory.
1540 return: a _OleDirectoryEntry object
1541 raise: IOError if the entry has always been referenced.
1542 """
1543 # check if SID is OK:
1544 if sid<0 or sid>=len(self.direntries):
1545 self._raise_defect(DEFECT_FATAL, "OLE directory index out of range")
1546 # check if entry was already referenced:
1547 if self.direntries[sid] is not None:
1548 self._raise_defect(DEFECT_INCORRECT,
1549 "double reference for OLE stream/storage")
1550 # if exception not raised, return the object
1551 return self.direntries[sid]
1552 self.directory_fp.seek(sid * 128)
1553 entry = self.directory_fp.read(128)
1554 self.direntries[sid] = _OleDirectoryEntry(entry, sid, self)
1555 return self.direntries[sid]
1556
1557
1558 def dumpdirectory(self):
1559 """
1560 Dump directory (for debugging only)
1561 """
1562 self.root.dump()
1563
1564
def _open(self, start, size = 0x7FFFFFFF, force_FAT=False):
    """
    Build the _OleStream object giving access to a stream.
    (openstream helper)

    The stream is read through the MiniFAT when its size is below the
    MiniSectorCutoff threshold and FAT access is not forced; in every
    other case it is read through the regular FAT.

    start: index of first sector
    size: size of stream (0x7FFFFFFF is used when the size is unknown)
    force_FAT: if False (default), FAT or MiniFAT is selected according to
        size. If True, the stream is always opened in FAT.
    return: _OleStream object
    """
    debug('OleFileIO.open(): sect=%d, size=%d, force_FAT=%s' %
        (start, size, str(force_FAT)))
    # stream size is compared to the MiniSectorCutoff threshold:
    if force_FAT or size >= self.minisectorcutoff:
        # standard stream, read directly from the file through the FAT
        return _OleStream(self.fp, start, size, 512,
            self.sectorsize, self.fat, self._filesize)
    # small stream: it is stored inside the ministream
    if not self.ministream:
        # the ministream is opened only once: load the MiniFAT first
        self.loadminifat()
        # start sector and size of the ministream are taken from the
        # root directory entry:
        size_ministream = self.root.size
        debug('Opening MiniStream: sect=%d, size=%d' %
            (self.root.isectStart, size_ministream))
        # the ministream itself is always read through the FAT:
        self.ministream = self._open(self.root.isectStart,
            size_ministream, force_FAT=True)
    return _OleStream(self.ministream, start, size, 0,
        self.minisectorsize, self.minifat,
        self.ministream.size)
1597
1598
def _list(self, files, prefix, node, streams=True, storages=False):
    """
    Recursively collect the paths of the entries located under node.
    (listdir helper)

    files: list of files to fill in (modified in place)
    prefix: current location in storage tree (list of names)
    node: current node (_OleDirectoryEntry object)
    streams: bool, include streams if True (True by default) - new in v0.26
    storages: bool, include storages if True (False by default) - new in v0.26
    (note: the root storage is never included)
    """
    prefix = prefix + [node.name]
    for entry in node.kids:
        # prefix[0] is the name of the root entry, which is never part of
        # the reported paths:
        path = prefix[1:] + [entry.name]
        # The entry type decides between storage and stream. Testing
        # "entry.kids" instead would report an empty storage as a stream.
        if entry.entry_type == STGTY_STORAGE:
            # this is a storage
            if storages:
                # add it to the list
                files.append(path)
            # check its kids
            self._list(files, prefix, entry, streams, storages)
        elif streams:
            # this is a stream: add it to the list
            files.append(path)
1623
1624
def listdir(self, streams=True, storages=False):
    """
    Return the list of entries stored in this file.

    Each item of the returned list is itself a list of names, as filled
    in by the _list helper starting from the root entry.

    streams: bool, include streams if True (True by default) - new in v0.26
    storages: bool, include storages if True (False by default) - new in v0.26
    (note: the root storage is never included)
    """
    collected = []
    # _list fills the list in place while walking the tree from the root:
    self._list(collected, [], self.root, streams, storages)
    return collected
1636
1637
def _find(self, filename):
    """
    Locate a directory entry from its path and return its sid.
    (openstream helper)
    Note: name comparison is case-insensitive.

    filename: path of stream in storage tree (except root entry), either:
        - a string using Unix path syntax, for example:
          'storage_1/storage_1.2/stream'
        - a list of storage filenames, path to the desired stream/storage.
          Example: ['storage_1', 'storage_1.2', 'stream']
    return: sid of requested filename
    raise IOError if file not found
    """
    # a path given as a string is split on slashes to obtain a list:
    if isinstance(filename, basestring):
        filename = filename.split('/')
    # walk down the storage tree, one path component at a time:
    current = self.root
    for name in filename:
        found = None
        for child in current.kids:
            if child.name.lower() == name.lower():
                found = child
                break
        if found is None:
            # no kid of the current node bears that name
            raise IOError("file not found")
        current = found
    return current.sid
1666
1667
def openstream(self, filename):
    """
    Open a stream as a read-only file object.

    filename: path of stream in storage tree (except root entry), either:
        - a string using Unix path syntax, for example:
          'storage_1/storage_1.2/stream'
        - a list of storage filenames, path to the desired stream/storage.
          Example: ['storage_1', 'storage_1.2', 'stream']
    return: file object (read-only), as built by _open
    raise IOError if filename not found, or if this is not a stream.
    """
    entry = self.direntries[self._find(filename)]
    if entry.entry_type == STGTY_STREAM:
        return self._open(entry.isectStart, entry.size)
    # storages and the root entry cannot be opened as streams
    raise IOError("this file is not a stream")
1685
1686
def get_type(self, filename):
    """
    Test if given filename exists as a stream or a storage in the OLE
    container, and return its type.

    filename: path of stream in storage tree. (see openstream for syntax)
    return: False if object does not exist, its entry type (>0) otherwise:
        - STGTY_STREAM: a stream
        - STGTY_STORAGE: a storage
        - STGTY_ROOT: the root entry
    """
    try:
        sid = self._find(filename)
        entry = self.direntries[sid]
        return entry.entry_type
    except Exception:
        # Any lookup failure means "no such object". Only Exception is
        # caught, so that KeyboardInterrupt and SystemExit still propagate
        # instead of being reported as a missing entry.
        return False
1704
1705
def getmtime(self, filename):
    """
    Return modification time of a stream/storage.

    The directory entry is located with _find, and the result of its
    getmtime() method is returned unchanged.

    filename: path of stream/storage in storage tree. (see openstream for
        syntax)
    return: None if modification time is null, a python datetime object
        otherwise (UTC timezone)

    new in version 0.26
    """
    return self.direntries[self._find(filename)].getmtime()
1720
1721
def getctime(self, filename):
    """
    Return creation time of a stream/storage.

    The directory entry is located with _find, and the result of its
    getctime() method is returned unchanged.

    filename: path of stream/storage in storage tree. (see openstream for
        syntax)
    return: None if creation time is null, a python datetime object
        otherwise (UTC timezone)

    new in version 0.26
    """
    return self.direntries[self._find(filename)].getctime()
1736
1737
def exists(self, filename):
    """
    Test if given filename exists as a stream or a storage in the OLE
    container.

    filename: path of stream in storage tree. (see openstream for syntax)
    return: True if object exist, else False.
    """
    try:
        # only the success of the lookup matters, the sid is not needed
        self._find(filename)
    except Exception:
        # Only Exception is caught, so that KeyboardInterrupt and
        # SystemExit still propagate instead of being reported as a
        # missing entry.
        return False
    return True
1751
1752
def get_size(self, filename):
    """
    Return size of a stream in the OLE container, in bytes.

    filename: path of stream in storage tree (see openstream for syntax)
    return: size in bytes (long integer)
    raise: IOError if file not found, TypeError if this is not a stream.
    """
    entry = self.direntries[self._find(filename)]
    if entry.entry_type == STGTY_STREAM:
        return entry.size
    #TODO: Should it return zero instead of raising an exception ?
    raise TypeError('object is not an OLE stream')
1767
1768
def get_rootentry_name(self):
    """
    Return the name of the root directory entry.

    Should usually be 'Root Entry' or 'R' in most implementations.
    """
    root_entry = self.root
    return root_entry.name
1775
1776
def getproperties(self, filename, convert_time=False, no_conversion=None):
    """
    Return properties described in substream.

    Parsing errors are not raised directly: they are reported through
    self._raise_defect with the DEFECT_INCORRECT level. If the header
    cannot be parsed and that call returns, an empty dictionary is
    returned; if a single property cannot be parsed and that call returns,
    the property is skipped and parsing goes on with the next one.

    filename: path of stream in storage tree (see openstream for syntax)
    convert_time: bool, if True timestamps will be converted to Python datetime
    no_conversion: None or list of int, timestamps not to be converted
        (for example total editing time is not a real timestamp)
    return: a dictionary of values indexed by id (integer)
    """
    # make sure no_conversion is a list, just to simplify code below:
    if no_conversion is None:
        no_conversion = []
    # stream path as a string to report exceptions.
    # basestring is tested (as in _find) so that a unicode path is kept
    # as is instead of being joined character by character:
    streampath = filename
    if not isinstance(streampath, basestring):
        streampath = '/'.join(streampath)

    fp = self.openstream(filename)

    data = {}

    try:
        # header
        s = fp.read(28)
        # (clsid and fmtid are parsed but not used further in this method)
        clsid = _clsid(s[8:24])

        # format id
        s = fp.read(20)
        fmtid = _clsid(s[:16])
        fp.seek(i32(s, 16))

        # get section: the 4 bytes of the size field are replaced by a
        # placeholder, so that offsets in s are relative to the section start
        s = "****" + fp.read(i32(fp.read(4))-4)
        # number of properties:
        num_props = i32(s, 4)
    except Exception:
        # catch exception while parsing property header, and only raise
        # a DEFECT_INCORRECT then return an empty dict, because this is not
        # a fatal error when parsing the whole file
        exctype, excvalue = sys.exc_info()[:2]
        msg = 'Error while parsing properties header in stream %s: %s' % (
            repr(streampath), excvalue)
        self._raise_defect(DEFECT_INCORRECT, msg, exctype)
        return data

    for i in range(num_props):
        try:
            prop_id = 0 # just in case of an exception
            prop_id = i32(s, 8+i*8)
            offset = i32(s, 12+i*8)
            prop_type = i32(s, offset)

            debug ('property id=%d: type=%d offset=%X' % (prop_id, prop_type, offset))

            # test for common types first (should perhaps use
            # a dictionary instead?)

            if prop_type == VT_I2: # 16-bit signed integer
                value = i16(s, offset+4)
                if value >= 32768:
                    value = value - 65536
            elif prop_type == VT_UI2: # 2-byte unsigned integer
                value = i16(s, offset+4)
            elif prop_type in (VT_I4, VT_INT, VT_ERROR):
                # VT_I4: 32-bit signed integer
                # VT_ERROR: HRESULT, similar to 32-bit signed integer,
                # see http://msdn.microsoft.com/en-us/library/cc230330.aspx
                value = i32(s, offset+4)
            elif prop_type in (VT_UI4, VT_UINT): # 4-byte unsigned integer
                value = i32(s, offset+4) # FIXME
            elif prop_type in (VT_BSTR, VT_LPSTR):
                # CodePageString, see http://msdn.microsoft.com/en-us/library/dd942354.aspx
                # size is a 32 bits integer, including the null terminator, and
                # possibly trailing or embedded null chars
                #TODO: if codepage is unicode, the string should be converted as such
                count = i32(s, offset+4)
                value = s[offset+8:offset+8+count-1]
                # remove all null chars:
                value = value.replace('\x00', '')
            elif prop_type == VT_BLOB:
                # binary large object (BLOB)
                # see http://msdn.microsoft.com/en-us/library/dd942282.aspx
                count = i32(s, offset+4)
                value = s[offset+8:offset+8+count]
            elif prop_type == VT_LPWSTR:
                # UnicodeString
                # see http://msdn.microsoft.com/en-us/library/dd942313.aspx
                # "the string should NOT contain embedded or additional trailing
                # null characters."
                count = i32(s, offset+4)
                value = _unicode(s[offset+8:offset+8+count*2])
            elif prop_type == VT_FILETIME:
                value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32)
                # FILETIME is a 64-bit int: "number of 100ns periods
                # since Jan 1,1601".
                if convert_time and prop_id not in no_conversion:
                    debug('Converting property #%d to python datetime, value=%d=%fs'
                        %(prop_id, value, float(value)/10000000))
                    # convert FILETIME to Python datetime.datetime
                    # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/
                    _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
                    debug('timedelta days=%d' % (value//(10*1000000*3600*24)))
                    value = _FILETIME_null_date + datetime.timedelta(microseconds=value//10)
                else:
                    # legacy code kept for backward compatibility: returns a
                    # number of seconds since Jan 1,1601
                    value = value // 10000000 # seconds
            elif prop_type == VT_UI1: # 1-byte unsigned integer
                value = ord(s[offset+4])
            elif prop_type == VT_CLSID:
                value = _clsid(s[offset+4:offset+20])
            elif prop_type == VT_CF:
                # PropertyIdentifier or ClipboardData??
                # see http://msdn.microsoft.com/en-us/library/dd941945.aspx
                count = i32(s, offset+4)
                value = s[offset+8:offset+8+count]
            elif prop_type == VT_BOOL:
                # VARIANT_BOOL, 16 bits bool, 0x0000=False, 0xFFFF=True
                # see http://msdn.microsoft.com/en-us/library/cc237864.aspx
                value = bool(i16(s, offset+4))
            else:
                value = None # everything else yields "None"
                debug ('property id=%d: type=%d not implemented in parser yet' % (prop_id, prop_type))

            # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE,
            # VT_DECIMAL, VT_I1, VT_I8, VT_UI8,
            # see http://msdn.microsoft.com/en-us/library/dd942033.aspx

            # FIXME: add support for VT_VECTOR
            # VT_VECTOR is a 32 uint giving the number of items, followed by
            # the items in sequence. The VT_VECTOR value is combined with the
            # type of items, e.g. VT_VECTOR|VT_BSTR
            # see http://msdn.microsoft.com/en-us/library/dd942011.aspx

            data[prop_id] = value
        except Exception:
            # catch exception while parsing each property, and only raise
            # a DEFECT_INCORRECT, because parsing can go on
            exctype, excvalue = sys.exc_info()[:2]
            msg = 'Error while parsing property id %d in stream %s: %s' % (
                prop_id, repr(streampath), excvalue)
            self._raise_defect(DEFECT_INCORRECT, msg, exctype)

    return data
1925
def get_metadata(self):
    """
    Parse standard properties streams, return an OleMetadata object
    containing all the available metadata.

    The object is stored in the metadata attribute of this OleFileIO
    object before it is filled in by its parse_properties method.

    new in version 0.25
    """
    parsed = OleMetadata()
    self.metadata = parsed
    parsed.parse_properties(self)
    return self.metadata
1937
1938#
1939# --------------------------------------------------------------------
1940# This script can be used to dump the directory of any OLE2 structured
1941# storage file.
1942
1943if __name__ == "__main__":
1944
1945 import sys
1946
1947 # [PL] display quick usage info if launched from command-line
1948 if len(sys.argv) <= 1:
1949 print __doc__
1950 print """
1951Launched from command line, this script parses OLE files and prints info.
1952
1953Usage: OleFileIO_PL.py [-d] [-c] <file> [file2 ...]
1954
1955Options:
1956-d : debug mode (display a lot of debug information, for developers only)
1957-c : check all streams (for debugging purposes)
1958"""
1959 sys.exit()
1960
1961 check_streams = False
1962 for filename in sys.argv[1:]:
1963## try:
1964 # OPTIONS:
1965 if filename == '-d':
1966 # option to switch debug mode on:
1967 set_debug_mode(True)
1968 continue
1969 if filename == '-c':
1970 # option to switch check streams mode on:
1971 check_streams = True
1972 continue
1973
1974 ole = OleFileIO(filename)#, raise_defects=DEFECT_INCORRECT)
1975 print "-" * 68
1976 print filename
1977 print "-" * 68
1978 ole.dumpdirectory()
1979 for streamname in ole.listdir():
1980 if streamname[-1][0] == "\005":
1981 print streamname, ": properties"
1982 props = ole.getproperties(streamname, convert_time=True)
1983 props = props.items()
1984 props.sort()
1985 for k, v in props:
1986 #[PL]: avoid to display too large or binary values:
1987 if isinstance(v, basestring):
1988 if len(v) > 50:
1989 v = v[:50]
1990 # quick and dirty binary check:
1991 for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20,
1992 21,22,23,24,25,26,27,28,29,30,31):
1993 if chr(c) in v:
1994 v = '(binary data)'
1995 break
1996 print " ", k, v
1997
1998 if check_streams:
1999 # Read all streams to check if there are errors:
2000 print '\nChecking streams...'
2001 for streamname in ole.listdir():
2002 # print name using repr() to convert binary chars to \xNN:
2003 print '-', repr('/'.join(streamname)),'-',
2004 st_type = ole.get_type(streamname)
2005 if st_type == STGTY_STREAM:
2006 print 'size %d' % ole.get_size(streamname)
2007 # just try to read stream in memory:
2008 ole.openstream(streamname)
2009 else:
2010 print 'NOT a stream : type=%d' % st_type
2011 print ''
2012
2013## for streamname in ole.listdir():
2014## # print name using repr() to convert binary chars to \xNN:
2015## print '-', repr('/'.join(streamname)),'-',
2016## print ole.getmtime(streamname)
2017## print ''
2018
2019 print 'Modification/Creation times of all directory entries:'
2020 for entry in ole.direntries:
2021 if entry is not None:
2022 print '- %s: mtime=%s ctime=%s' % (entry.name,
2023 entry.getmtime(), entry.getctime())
2024 print ''
2025
2026 # parse and display metadata:
2027 meta = ole.get_metadata()
2028 meta.dump()
2029 print ''
2030 #[PL] Test a few new methods:
2031 root = ole.get_rootentry_name()
2032 print 'Root entry name: "%s"' % root
2033 if ole.exists('worddocument'):
2034 print "This is a Word document."
2035 print "type of stream 'WordDocument':", ole.get_type('worddocument')
2036 print "size :", ole.get_size('worddocument')
2037 if ole.exists('macros/vba'):
2038 print "This document may contain VBA macros."
2039
2040 # print parsing issues:
2041 print '\nNon-fatal issues raised during parsing:'
2042 if ole.parsing_issues:
2043 for exctype, msg in ole.parsing_issues:
2044 print '- %s: %s' % (exctype.__name__, msg)
2045 else:
2046 print 'None'
2047## except IOError, v:
2048## print "***", "cannot read", file, "-", v