# git-repo/import_zip.py

#
# Copyright (C) 2008 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import stat
import struct
import zlib
import cStringIO

from import_ext import ImportExternal
from error import ImportError

class ImportZip(ImportExternal):
  """Importer that unpacks a zip (or jar) archive, read as a
     forward-only stream, into a Project's Git repository.
  """
  @classmethod
  def CanAccept(cls, url):
    """Return True when url names a '.zip' or '.jar' file.
    """
    return url.endswith('.zip') or url.endswith('.jar')

  def _UnpackFiles(self):
    """Unpack every archive member, then apply the file modes
       recorded in the central directory.

       Raises ImportError if the opened url does not carry a
       recognized extension.  The url stream is always closed.
    """
    url_fd, url = self._OpenUrl()
    try:
      if not self.__class__.CanAccept(url):
        raise ImportError('non-zip file extension: %s' % url)

      archive = _ZipFile(url_fd)

      # Pass 1: the file records, each followed by its content.
      for member in archive.FileRecords():
        payload = archive.Open(member).read()

        # Normalize line endings only when doing so is reversible.
        if payload and _SafeCRLF(payload):
          payload = payload.replace('\r\n', '\n')

        self._UnpackOneFile(member.mode,
                            len(payload),
                            member.name,
                            cStringIO.StringIO(payload))
        archive.Close(member)

      # Pass 2: the central directory supplies the file modes.
      for member in archive.CentralDirectory():
        self._SetFileMode(member.name, member.mode)

      archive.CheckTail()
    finally:
      url_fd.close()


def _SafeCRLF(data):
  """Is it reasonably safe to perform a CRLF->LF conversion?

     If the stream contains a NUL byte it is likely binary,
     and thus a CRLF->LF conversion may damage the stream.

     If the only NUL is in the last position of the stream,
     but it otherwise can do a CRLF<->LF conversion we do
     the CRLF conversion anyway.  At least one source ZIP
     file has this structure in its source code.

     If every occurrance of a CR and LF is paired up as a
     CRLF pair then the conversion is safely bi-directional.
     s/\r\n/\n/g == s/\n/\r\\n/g can convert between them.
  """
  nul = data.find('\0')
  if 0 <= nul and nul < (len(data) - 1):
    return False

  n_lf = 0
  last = 0
  while True:
    lf = data.find('\n', last)
    if lf < 0:
      break
    if lf == 0 or data[lf - 1] != '\r':
      return False
    last = lf + 1
    n_lf += 1
  return n_lf > 0

class _ZipFile(object):
  """Streaming iterator to parse a zip file on the fly.
  """
  def __init__(self, fd):
    self._fd = _UngetStream(fd)

  def FileRecords(self):
    return _FileIter(self._fd)

  def CentralDirectory(self):
    return _CentIter(self._fd)

  def CheckTail(self):
    type_buf = self._fd.read(4)
    type = struct.unpack('<I', type_buf)[0]
    if type != 0x06054b50:  # end of central directory
      raise ImportError('zip record %x unsupported' % type)

  def Open(self, entry):
    if entry.is_compressed:
      return _InflateStream(self._fd)
    else:
      if entry.has_trailer:
        raise ImportError('unable to extract streamed zip')
      return _FixedLengthStream(self._fd, entry.uncompressed_size)

  def Close(self, entry):
    if entry.has_trailer:
      type = struct.unpack('<I', self._fd.read(4))[0]
      if type == 0x08074b50:
        # Not a formal type marker, but commonly seen in zips
        # as the data descriptor signature.
        #
        struct.unpack('<3I', self._fd.read(12))
      else:
        # No signature for the data descriptor, so read the
        # remaining fields out of the stream
        #
        self._fd.read(8)


class _FileIter(object):
  def __init__(self, fd):
    self._fd = fd

  def __iter__(self):
    return self

  def next(self):
    fd = self._fd

    type_buf = fd.read(4)
    type = struct.unpack('<I', type_buf)[0]

    if type != 0x04034b50:    # local file header
      fd.unread(type_buf)
      raise StopIteration()

    rec = _FileHeader(fd.read(26))
    rec.name = fd.read(rec.name_len)
    fd.read(rec.extra_len)

    if rec.name.endswith('/'):
      rec.name = rec.name[:-1]
      rec.mode = stat.S_IFDIR | 0777
    return rec


class _FileHeader(object):
  """Information about a single file in the archive.
     0  version needed to extract       2 bytes
     1  general purpose bit flag        2 bytes
     2  compression method              2 bytes
     3  last mod file time              2 bytes
     4  last mod file date              2 bytes
     5  crc-32                          4 bytes
     6  compressed size                 4 bytes
     7  uncompressed size               4 bytes
     8  file name length                2 bytes
     9  extra field length              2 bytes
  """
  def __init__(self, raw_bin):
    rec = struct.unpack('<5H3I2H', raw_bin)

    if rec[2] == 8:
      self.is_compressed = True
    elif rec[2] == 0:
      self.is_compressed = False
    else:
      raise ImportError('unrecognized compression format')

    if rec[1] & (1 << 3):
      self.has_trailer = True
    else:
      self.has_trailer = False

    self.compressed_size  = rec[6]
    self.uncompressed_size = rec[7]
    self.name_len = rec[8]
    self.extra_len = rec[9]
    self.mode = stat.S_IFREG | 0644


class _CentIter(object):
  def __init__(self, fd):
    self._fd = fd

  def __iter__(self):
    return self

  def next(self):
    fd = self._fd

    type_buf = fd.read(4)
    type = struct.unpack('<I', type_buf)[0]

    if type != 0x02014b50:  # central directory
      fd.unread(type_buf)
      raise StopIteration()

    rec = _CentHeader(fd.read(42))
    rec.name = fd.read(rec.name_len)
    fd.read(rec.extra_len)
    fd.read(rec.comment_len)

    if rec.name.endswith('/'):
      rec.name = rec.name[:-1]
      rec.mode = stat.S_IFDIR | 0777
    return rec


class _CentHeader(object):
  """Information about a single file in the archive.
     0  version made by                 2 bytes
     1  version needed to extract       2 bytes
     2  general purpose bit flag        2 bytes
     3  compression method              2 bytes
     4  last mod file time              2 bytes
     5  last mod file date              2 bytes
     6  crc-32                          4 bytes
     7  compressed size                 4 bytes
     8  uncompressed size               4 bytes
     9  file name length                2 bytes
    10  extra field length              2 bytes
    11  file comment length             2 bytes
    12  disk number start               2 bytes
    13  internal file attributes        2 bytes
    14  external file attributes        4 bytes
    15  relative offset of local header 4 bytes
  """
  def __init__(self, raw_bin):
    rec = struct.unpack('<6H3I5H2I', raw_bin)
    self.name_len = rec[9]
    self.extra_len = rec[10]
    self.comment_len = rec[11]

    if (rec[0] & 0xff00) == 0x0300:  # UNIX
      self.mode = rec[14] >> 16
    else:
      self.mode = stat.S_IFREG | 0644


class _UngetStream(object):
  """File like object to read and rewind a stream.
  """
  def __init__(self, fd):
    self._fd = fd
    self._buf = None

  def read(self, size = -1):
    r = []
    try:
      if size >= 0:
        self._ReadChunk(r, size)
      else:
        while True:
          self._ReadChunk(r, 2048)
    except EOFError:
      pass

    if len(r) == 1:
      return r[0]
    return ''.join(r)

  def unread(self, buf):
    b = self._buf
    if b is None or len(b) == 0:
      self._buf = buf
    else:
      self._buf = buf + b

  def _ReadChunk(self, r, size):
    b = self._buf
    try:
      while size > 0:
        if b is None or len(b) == 0:
          b = self._Inflate(self._fd.read(2048))
          if not b:
            raise EOFError()
          continue

        use = min(size, len(b))
        r.append(b[:use])
        b = b[use:]
        size -= use
    finally:
      self._buf = b

  def _Inflate(self, b):
    return b


class _FixedLengthStream(_UngetStream):
  """File like object exposing only the next 'have' bytes of the
     wrapped stream; anything read beyond that is pushed back.
  """
  def __init__(self, fd, have):
    _UngetStream.__init__(self, fd)
    self._have = have

  def _Inflate(self, b):
    remaining = self._have
    if remaining == 0:
      # Quota used up: return the whole chunk to the source.
      self._fd.unread(b)
      return None

    # Anything past the quota belongs to whoever reads next.
    surplus = b[remaining:]
    if surplus:
      self._fd.unread(surplus)
      b = b[:remaining]

    self._have = remaining - len(b)
    return b


class _InflateStream(_UngetStream):
  """File like object that decompresses a raw deflate stream
     as its input is read.
  """
  def __init__(self, fd):
    _UngetStream.__init__(self, fd)
    # Negative window bits: no zlib header or checksum expected.
    self._z = zlib.decompressobj(-zlib.MAX_WBITS)

  def _Inflate(self, b):
    inflater = self._z
    if not inflater:
      # Already finished; hand the chunk back untouched.
      self._fd.unread(b)
      return None

    out = inflater.decompress(b)

    tail = inflater.unconsumed_tail
    if tail != '':
      self._fd.unread(tail)
      return out

    extra = inflater.unused_data
    if extra != '':
      # Input past the end of the compressed data: return it to
      # the source and stop inflating.
      self._fd.unread(extra)
      self._z = None
    return out