git-repo/import_zip.py

#
# Copyright (C) 2008 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import stat
import struct
import zlib
import cStringIO

from import_ext import ImportExternal
from error import ImportError

class ImportZip(ImportExternal):
  """Streams a zip file from the network directly into a Project's
     Git repository.
  """
  @classmethod
  def CanAccept(cls, url):
    """Can this importer read and unpack the data stored at url?
    """
    if url.endswith('.zip') or url.endswith('.jar'):
      return True
    return False

  def _UnpackFiles(self):
    url_fd, url = self._OpenUrl()
    try:
      if not self.__class__.CanAccept(url):
        raise ImportError('non-zip file extension: %s' % url)

      zip = _ZipFile(url_fd)
      for entry in zip.FileRecords():
        data = zip.Open(entry).read()
        sz = len(data)

        if data and _SafeCRLF(data):
          data = data.replace('\r\n', '\n')
          sz = len(data)

        fd = cStringIO.StringIO(data)
        self._UnpackOneFile(entry.mode, sz, entry.name, fd)
        zip.Close(entry)

      for entry in zip.CentralDirectory():
        self._SetFileMode(entry.name, entry.mode)

      zip.CheckTail()
    finally:
      url_fd.close()


def _SafeCRLF(data):
  """Is it reasonably safe to perform a CRLF->LF conversion?

     If the stream contains a NUL byte it is likely binary,
     and thus a CRLF->LF conversion may damage the stream.

     If the only NUL is in the last position of the stream,
     but it otherwise can do a CRLF<->LF conversion we do
     the CRLF conversion anyway.  At least one source ZIP
     file has this structure in its source code.

     If every occurrance of a CR and LF is paired up as a
     CRLF pair then the conversion is safely bi-directional.
     s/\r\n/\n/g == s/\n/\r\\n/g can convert between them.
  """
  nul = data.find('\0')
  if 0 <= nul and nul < (len(data) - 1):
    return False

  n_lf = 0
  last = 0
  while True:
    lf = data.find('\n', last)
    if lf < 0:
      break
    if lf == 0 or data[lf - 1] != '\r':
      return False
    last = lf + 1
    n_lf += 1
  return n_lf > 0

class _ZipFile(object):
  """Streaming iterator to parse a zip file on the fly.
  """
  def __init__(self, fd):
    self._fd = _UngetStream(fd)

  def FileRecords(self):
    return _FileIter(self._fd)

  def CentralDirectory(self):
    return _CentIter(self._fd)

  def CheckTail(self):
    type_buf = self._fd.read(4)
    type = struct.unpack('<I', type_buf)[0]
    if type != 0x06054b50:  # end of central directory
      raise ImportError('zip record %x unsupported' % type)

  def Open(self, entry):
    if entry.is_compressed:
      return _InflateStream(self._fd)
    else:
      if entry.has_trailer:
        raise ImportError('unable to extract streamed zip')
      return _FixedLengthStream(self._fd, entry.uncompressed_size)

  def Close(self, entry):
    if entry.has_trailer:
      type = struct.unpack('<I', self._fd.read(4))[0]
      if type == 0x08074b50:
        # Not a formal type marker, but commonly seen in zips
        # as the data descriptor signature.
        #
        struct.unpack('<3I', self._fd.read(12))
      else:
        # No signature for the data descriptor, so read the
        # remaining fields out of the stream
        #
        self._fd.read(8)


class _FileIter(object):
  def __init__(self, fd):
    self._fd = fd

  def __iter__(self):
    return self

  def next(self):
    fd = self._fd

    type_buf = fd.read(4)
    type = struct.unpack('<I', type_buf)[0]

    if type != 0x04034b50:    # local file header
      fd.unread(type_buf)
      raise StopIteration()

    rec = _FileHeader(fd.read(26))
    rec.name = fd.read(rec.name_len)
    fd.read(rec.extra_len)

    if rec.name.endswith('/'):
      rec.name = rec.name[:-1]
      rec.mode = stat.S_IFDIR | 0777
    return rec


class _FileHeader(object):
  """Information about a single file in the archive.
     0  version needed to extract       2 bytes
     1  general purpose bit flag        2 bytes
     2  compression method              2 bytes
     3  last mod file time              2 bytes
     4  last mod file date              2 bytes
     5  crc-32                          4 bytes
     6  compressed size                 4 bytes
     7  uncompressed size               4 bytes
     8  file name length                2 bytes
     9  extra field length              2 bytes
  """
  def __init__(self, raw_bin):
    rec = struct.unpack('<5H3I2H', raw_bin)
    
    if rec[2] == 8:
      self.is_compressed = True
    elif rec[2] == 0:
      self.is_compressed = False
    else:
      raise ImportError('unrecognized compression format')

    if rec[1] & (1 << 3):
      self.has_trailer = True
    else:
      self.has_trailer = False

    self.compressed_size  = rec[6]
    self.uncompressed_size = rec[7]
    self.name_len = rec[8]
    self.extra_len = rec[9]
    self.mode = stat.S_IFREG | 0644


class _CentIter(object):
  def __init__(self, fd):
    self._fd = fd

  def __iter__(self):
    return self

  def next(self):
    fd = self._fd

    type_buf = fd.read(4)
    type = struct.unpack('<I', type_buf)[0]

    if type != 0x02014b50:  # central directory
      fd.unread(type_buf)
      raise StopIteration()

    rec = _CentHeader(fd.read(42))
    rec.name = fd.read(rec.name_len)
    fd.read(rec.extra_len)
    fd.read(rec.comment_len)

    if rec.name.endswith('/'):
      rec.name = rec.name[:-1]
      rec.mode = stat.S_IFDIR | 0777
    return rec


class _CentHeader(object):
  """Information about a single file in the archive.
     0  version made by                 2 bytes
     1  version needed to extract       2 bytes
     2  general purpose bit flag        2 bytes
     3  compression method              2 bytes
     4  last mod file time              2 bytes
     5  last mod file date              2 bytes
     6  crc-32                          4 bytes
     7  compressed size                 4 bytes
     8  uncompressed size               4 bytes
     9  file name length                2 bytes
    10  extra field length              2 bytes
    11  file comment length             2 bytes
    12  disk number start               2 bytes
    13  internal file attributes        2 bytes
    14  external file attributes        4 bytes
    15  relative offset of local header 4 bytes
  """
  def __init__(self, raw_bin):
    rec = struct.unpack('<6H3I5H2I', raw_bin)
    self.name_len = rec[9]
    self.extra_len = rec[10]
    self.comment_len = rec[11]

    if (rec[0] & 0xff00) == 0x0300:  # UNIX
      self.mode = rec[14] >> 16
    else:
      self.mode = stat.S_IFREG | 0644


class _UngetStream(object):
  """File like object to read and rewind a stream.
  """
  def __init__(self, fd):
    self._fd = fd
    self._buf = None

  def read(self, size = -1):
    r = []
    try:
      if size >= 0:
        self._ReadChunk(r, size)
      else:
        while True:
          self._ReadChunk(r, 2048)
    except EOFError:
      pass

    if len(r) == 1:
      return r[0]
    return ''.join(r)

  def unread(self, buf):
    b = self._buf
    if b is None or len(b) == 0:
      self._buf = buf
    else:
      self._buf = buf + b

  def _ReadChunk(self, r, size):
    b = self._buf
    try:
      while size > 0:
        if b is None or len(b) == 0:
          b = self._Inflate(self._fd.read(2048))
          if not b:
            raise EOFError()
          continue

        use = min(size, len(b))
        r.append(b[:use])
        b = b[use:]
        size -= use
    finally:
      self._buf = b

  def _Inflate(self, b):
    return b


class _FixedLengthStream(_UngetStream):
  """File like object to read a fixed length stream.
  """
  def __init__(self, fd, have):
    _UngetStream.__init__(self, fd)
    self._have = have

  def _Inflate(self, b):
    n = self._have
    if n == 0:
      self._fd.unread(b)
      return None

    if len(b) > n:
      self._fd.unread(b[n:])
      b = b[:n]
    self._have -= len(b)
    return b


class _InflateStream(_UngetStream):
  """Inflates the stream as it reads input.
  """
  def __init__(self, fd):
    _UngetStream.__init__(self, fd)
    self._z = zlib.decompressobj(-zlib.MAX_WBITS)

  def _Inflate(self, b):
    z = self._z
    if not z:
      self._fd.unread(b)
      return None

    b = z.decompress(b)
    if z.unconsumed_tail != '':
      self._fd.unread(z.unconsumed_tail)
    elif z.unused_data != '':
      self._fd.unread(z.unused_data)
      self._z = None
    return b
Initial Contribution 2008-10-21 14:00:00 +00:00			`#`
			`# Copyright (C) 2008 The Android Open Source Project`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`import stat`
			`import struct`
			`import zlib`
			`import cStringIO`

			`from import_ext import ImportExternal`
			`from error import ImportError`

			`class ImportZip(ImportExternal):`
			`"""Streams a zip file from the network directly into a Project's`
			`Git repository.`
			`"""`
			`@classmethod`
			`def CanAccept(cls, url):`
			`"""Can this importer read and unpack the data stored at url?`
			`"""`
			`if url.endswith('.zip') or url.endswith('.jar'):`
			`return True`
			`return False`

			`def _UnpackFiles(self):`
			`url_fd, url = self._OpenUrl()`
			`try:`
			`if not self.__class__.CanAccept(url):`
			`raise ImportError('non-zip file extension: %s' % url)`

			`zip = _ZipFile(url_fd)`
			`for entry in zip.FileRecords():`
			`data = zip.Open(entry).read()`
			`sz = len(data)`

			`if data and _SafeCRLF(data):`
			`data = data.replace('\r\n', '\n')`
			`sz = len(data)`

			`fd = cStringIO.StringIO(data)`
			`self._UnpackOneFile(entry.mode, sz, entry.name, fd)`
			`zip.Close(entry)`

			`for entry in zip.CentralDirectory():`
			`self._SetFileMode(entry.name, entry.mode)`

			`zip.CheckTail()`
			`finally:`
			`url_fd.close()`


			`def _SafeCRLF(data):`
			`"""Is it reasonably safe to perform a CRLF->LF conversion?`

			`If the stream contains a NUL byte it is likely binary,`
			`and thus a CRLF->LF conversion may damage the stream.`

			`If the only NUL is in the last position of the stream,`
			`but it otherwise can do a CRLF<->LF conversion we do`
			`the CRLF conversion anyway. At least one source ZIP`
			`file has this structure in its source code.`

			`If every occurrance of a CR and LF is paired up as a`
			`CRLF pair then the conversion is safely bi-directional.`
			`s/\r\n/\n/g == s/\n/\r\\n/g can convert between them.`
			`"""`
			`nul = data.find('\0')`
			`if 0 <= nul and nul < (len(data) - 1):`
			`return False`

			`n_lf = 0`
			`last = 0`
			`while True:`
			`lf = data.find('\n', last)`
			`if lf < 0:`
			`break`
			`if lf == 0 or data[lf - 1] != '\r':`
			`return False`
			`last = lf + 1`
			`n_lf += 1`
			`return n_lf > 0`

			`class _ZipFile(object):`
			`"""Streaming iterator to parse a zip file on the fly.`
			`"""`
			`def __init__(self, fd):`
			`self._fd = _UngetStream(fd)`

			`def FileRecords(self):`
			`return _FileIter(self._fd)`

			`def CentralDirectory(self):`
			`return _CentIter(self._fd)`

			`def CheckTail(self):`
			`type_buf = self._fd.read(4)`
			`type = struct.unpack('<I', type_buf)[0]`
			`if type != 0x06054b50: # end of central directory`
			`raise ImportError('zip record %x unsupported' % type)`

			`def Open(self, entry):`
			`if entry.is_compressed:`
			`return _InflateStream(self._fd)`
			`else:`
			`if entry.has_trailer:`
			`raise ImportError('unable to extract streamed zip')`
			`return _FixedLengthStream(self._fd, entry.uncompressed_size)`

			`def Close(self, entry):`
			`if entry.has_trailer:`
			`type = struct.unpack('<I', self._fd.read(4))[0]`
			`if type == 0x08074b50:`
			`# Not a formal type marker, but commonly seen in zips`
			`# as the data descriptor signature.`
			`#`
			`struct.unpack('<3I', self._fd.read(12))`
			`else:`
			`# No signature for the data descriptor, so read the`
			`# remaining fields out of the stream`
			`#`
			`self._fd.read(8)`


			`class _FileIter(object):`
			`def __init__(self, fd):`
			`self._fd = fd`

			`def __iter__(self):`
			`return self`

			`def next(self):`
			`fd = self._fd`

			`type_buf = fd.read(4)`
			`type = struct.unpack('<I', type_buf)[0]`

			`if type != 0x04034b50: # local file header`
			`fd.unread(type_buf)`
			`raise StopIteration()`

			`rec = _FileHeader(fd.read(26))`
			`rec.name = fd.read(rec.name_len)`
			`fd.read(rec.extra_len)`

			`if rec.name.endswith('/'):`
			`rec.name = rec.name[:-1]`
			`rec.mode = stat.S_IFDIR \| 0777`
			`return rec`


			`class _FileHeader(object):`
			`"""Information about a single file in the archive.`
			`0 version needed to extract 2 bytes`
			`1 general purpose bit flag 2 bytes`
			`2 compression method 2 bytes`
			`3 last mod file time 2 bytes`
			`4 last mod file date 2 bytes`
			`5 crc-32 4 bytes`
			`6 compressed size 4 bytes`
			`7 uncompressed size 4 bytes`
			`8 file name length 2 bytes`
			`9 extra field length 2 bytes`
			`"""`
			`def __init__(self, raw_bin):`
			`rec = struct.unpack('<5H3I2H', raw_bin)`

			`if rec[2] == 8:`
			`self.is_compressed = True`
			`elif rec[2] == 0:`
			`self.is_compressed = False`
			`else:`
			`raise ImportError('unrecognized compression format')`

			`if rec[1] & (1 << 3):`
			`self.has_trailer = True`
			`else:`
			`self.has_trailer = False`

			`self.compressed_size = rec[6]`
			`self.uncompressed_size = rec[7]`
			`self.name_len = rec[8]`
			`self.extra_len = rec[9]`
			`self.mode = stat.S_IFREG \| 0644`


			`class _CentIter(object):`
			`def __init__(self, fd):`
			`self._fd = fd`

			`def __iter__(self):`
			`return self`

			`def next(self):`
			`fd = self._fd`

			`type_buf = fd.read(4)`
			`type = struct.unpack('<I', type_buf)[0]`

			`if type != 0x02014b50: # central directory`
			`fd.unread(type_buf)`
			`raise StopIteration()`

			`rec = _CentHeader(fd.read(42))`
			`rec.name = fd.read(rec.name_len)`
			`fd.read(rec.extra_len)`
			`fd.read(rec.comment_len)`

			`if rec.name.endswith('/'):`
			`rec.name = rec.name[:-1]`
			`rec.mode = stat.S_IFDIR \| 0777`
			`return rec`


			`class _CentHeader(object):`
			`"""Information about a single file in the archive.`
			`0 version made by 2 bytes`
			`1 version needed to extract 2 bytes`
			`2 general purpose bit flag 2 bytes`
			`3 compression method 2 bytes`
			`4 last mod file time 2 bytes`
			`5 last mod file date 2 bytes`
			`6 crc-32 4 bytes`
			`7 compressed size 4 bytes`
			`8 uncompressed size 4 bytes`
			`9 file name length 2 bytes`
			`10 extra field length 2 bytes`
			`11 file comment length 2 bytes`
			`12 disk number start 2 bytes`
			`13 internal file attributes 2 bytes`
			`14 external file attributes 4 bytes`
			`15 relative offset of local header 4 bytes`
			`"""`
			`def __init__(self, raw_bin):`
			`rec = struct.unpack('<6H3I5H2I', raw_bin)`
			`self.name_len = rec[9]`
			`self.extra_len = rec[10]`
			`self.comment_len = rec[11]`

			`if (rec[0] & 0xff00) == 0x0300: # UNIX`
			`self.mode = rec[14] >> 16`
			`else:`
			`self.mode = stat.S_IFREG \| 0644`


			`class _UngetStream(object):`
			`"""File like object to read and rewind a stream.`
			`"""`
			`def __init__(self, fd):`
			`self._fd = fd`
			`self._buf = None`

			`def read(self, size = -1):`
			`r = []`
			`try:`
			`if size >= 0:`
			`self._ReadChunk(r, size)`
			`else:`
			`while True:`
			`self._ReadChunk(r, 2048)`
			`except EOFError:`
			`pass`

			`if len(r) == 1:`
			`return r[0]`
			`return ''.join(r)`

			`def unread(self, buf):`
			`b = self._buf`
			`if b is None or len(b) == 0:`
			`self._buf = buf`
			`else:`
			`self._buf = buf + b`

			`def _ReadChunk(self, r, size):`
			`b = self._buf`
			`try:`
			`while size > 0:`
			`if b is None or len(b) == 0:`
			`b = self._Inflate(self._fd.read(2048))`
			`if not b:`
			`raise EOFError()`
			`continue`

			`use = min(size, len(b))`
			`r.append(b[:use])`
			`b = b[use:]`
			`size -= use`
			`finally:`
			`self._buf = b`

			`def _Inflate(self, b):`
			`return b`


			`class _FixedLengthStream(_UngetStream):`
			`"""File like object to read a fixed length stream.`
			`"""`
			`def __init__(self, fd, have):`
			`_UngetStream.__init__(self, fd)`
			`self._have = have`

			`def _Inflate(self, b):`
			`n = self._have`
			`if n == 0:`
			`self._fd.unread(b)`
			`return None`

			`if len(b) > n:`
			`self._fd.unread(b[n:])`
			`b = b[:n]`
			`self._have -= len(b)`
			`return b`


			`class _InflateStream(_UngetStream):`
			`"""Inflates the stream as it reads input.`
			`"""`
			`def __init__(self, fd):`
			`_UngetStream.__init__(self, fd)`
			`self._z = zlib.decompressobj(-zlib.MAX_WBITS)`

			`def _Inflate(self, b):`
			`z = self._z`
			`if not z:`
			`self._fd.unread(b)`
			`return None`

			`b = z.decompress(b)`
			`if z.unconsumed_tail != '':`
			`self._fd.unread(z.unconsumed_tail)`
			`elif z.unused_data != '':`
			`self._fd.unread(z.unused_data)`
			`self._z = None`
			`return b`