#! /usr/bin/python
#
# cvs2svn: ...
#

import rcsparse
import os
import sys
import time
import fileinput
import string
import getopt
import stat
import md5
import shutil
import anydbm
import marshal

# Base names of the working files.  DATAFILE is combined with the *_SUFFIX
# values below to name the revision lists written by pass1; DUMPFILE is the
# per-revision svn dumpfile that Dump writes and feeds to 'svnadmin load'.
DATAFILE		= 'bkcvs2svn-data'
DUMPFILE		= 'bkcvs2svn-dump'

# Persistent databases opened with anydbm by TreeMirror and BKcvs2svn_State.
HEAD_MIRROR_FILE	= 'bkcvs2svn-tree.db'  # Mirror the head tree
STATE_FILE		= 'bkcvs2svn-state.db'

# Suffixes for the unsorted and the sorted revision list.
REVS_SUFFIX		= '.revs'
SORTED_REVS_SUFFIX	= '.s-revs'

# Operation codes stored in each revision-list line.
OP_DELETE = 'D'
OP_CHANGE = 'C'

# Set to 1 by the -v command-line option.
verbose = 0

# Filled in by ChangeSetParser.set_revision_info():
#   changesets: timestamp -> (author, log)
#   cset_tags:  timestamp -> tag name (with '_' replaced by '.')
changesets = { }
cset_tags = { }

class BKcvs2svn_State:
  """Persistent bookkeeping for conversion runs, kept in STATE_FILE.

  Every value is stored marshalled under one of these keys:

    'lastcrev'  the revision counter value last recorded by incr_rev()
    'last_ts'   the timestamp last recorded by set_ts()
    'mtime'     the value last recorded by set_mtime(); absent until
                set_mtime() has been called once
  """

  def __init__(self):
    'Open the state db, creating it and seeding the counters if needed.'
    self.db = anydbm.open(STATE_FILE, 'c')

    # A fresh database starts both counters at zero.
    for key in ('lastcrev', 'last_ts'):
      if not self.db.has_key(key):
        self.db[key] = marshal.dumps(0)

    # The revision to work on is one past the last one recorded.
    self.rev = marshal.loads(self.db['lastcrev']) + 1

  def get_rev(self):
    'Return the current revision number.'
    return self.rev

  def incr_rev(self):
    'Record the current revision number in the db, then advance it by one.'
    self.db['lastcrev'] = marshal.dumps(self.rev)
    self.rev = self.rev + 1

  def get_ts(self):
    'Return the timestamp stored by the last set_ts() (0 if never set).'
    return marshal.loads(self.db['last_ts'])

  def set_ts(self, timestamp):
    'Store TIMESTAMP in the db.'
    self.db['last_ts'] = marshal.dumps(timestamp)

  def get_mtime(self):
    'Return the value stored by set_mtime(), or 0 if none was stored.'
    if self.db.has_key('mtime'):
      return marshal.loads(self.db['mtime'])
    return 0

  def set_mtime(self, mtime):
    'Store MTIME in the db.'
    self.db['mtime'] = marshal.dumps(mtime)

  def close(self):
    'Close the underlying db.'
    self.db.close()


class ChangeSetParser(rcsparse.Sink):
  """rcsparse sink that collects changeset metadata from ChangeSet,v.

  As the parser reports each revision's log, the module-level
  'changesets' dictionary gains an entry timestamp -> (author, log) and,
  for tagged revisions, 'cset_tags' gains timestamp -> tag name.
  """

  def __init__(self):
    self.authors = {}      # revision -> author
    self.tags = {}         # revision -> tag name
    self.timestamp = {}    # revision -> integer timestamp
    self.mtime = 0         # mtime of ChangeSet,v, set by parse_repo()

  def define_revision(self, revision, timestamp, author, state,
                      branches, next):
    'Remember the author and the (integer) timestamp of REVISION.'
    self.authors[revision] = author
    self.timestamp[revision] = int(timestamp)

  def define_tag(self, name, revision):
    'Remember tag NAME for REVISION; a later tag replaces an earlier one.'
    self.tags[revision] = name

  def set_revision_info(self, revision, log, text):
    'Publish REVISION into the changesets and cset_tags globals.'
    when = self.timestamp[revision]
    changesets[when] = (self.authors[revision], log)
    if self.tags.has_key(revision):
      # Tag names are recorded with underscores turned into dots.
      cset_tags[when] = self.tags[revision].replace('_', '.')

  def parse_repo(self, rcs_root):
    'Parse RCS_ROOT/ChangeSet,v and record that file\'s mtime in self.mtime.'
    changeset_file = rcs_root + "/ChangeSet,v"
    self.mtime = os.stat(changeset_file)[stat.ST_MTIME]
    rcsfile = open(changeset_file, 'r')
    rcsparse.Parser().parse(rcsfile, self)
    rcsfile.close()

class CollectData(rcsparse.Sink):
  """rcsparse sink that appends one line per new file revision to the
  unsorted revision list (DATAFILE + REVS_SUFFIX).

  Only revisions whose timestamp is later than the one stored in
  ctx.state are written.
  """

  def __init__(self, ctx):
    self.ctx = ctx
    self.revs = open(DATAFILE + REVS_SUFFIX, 'w')

  def set_fname(self, fname):
    "Prepare to receive data for a new file."
    self.fname = fname
    # Per-file tables, keyed by revision.
    self.timestamp = {}
    self.op = {}

  def define_revision(self, revision, timestamp, author, state,
                      branches, next):
    'Record the operation and the timestamp of REVISION.'
    ### what else?
    # A revision in state 'dead' is a deletion; anything else is a change.
    if state == 'dead':
      op = OP_DELETE
    else:
      op = OP_CHANGE
    self.op[revision] = op

    self.timestamp[revision] = int(timestamp)

  def set_revision_info(self, revision, log, text):
    'Write REVISION to the revision list unless it is not newer than the stored timestamp.'
    when = self.timestamp[revision]
    if when <= self.ctx.state.get_ts():
      return
    write_revs_line(self.revs, when, self.op[revision],
                    revision, self.fname)

  def close(self):
    'Close the revision list.'
    self.revs.close()


def relative_name(cvsroot, fname):
  """Return FNAME expressed relative to CVSROOT.

  The CVSROOT prefix and one separating '/' are stripped from FNAME.
  If FNAME is exactly CVSROOT the result is the empty string.

  If FNAME does not live under CVSROOT it is returned unchanged.  This
  covers both a different prefix and a prefix that only matches part of
  a path component (CVSROOT '/repo' against '/repository/x').
  """
  l = len(cvsroot)
  if fname[:l] != cvsroot:
    return fname

  rest = fname[l:]
  if rest[:1] == '/':
    return rest[1:]

  # No separator follows the prefix.  That is fine when nothing follows at
  # all, or when CVSROOT is empty or already ends in '/'; otherwise the
  # match ended in the middle of a component and FNAME is not under CVSROOT.
  if rest and cvsroot and cvsroot[-1:] != '/':
    return fname
  return rest


def visit_file(arg, dirname, files):
  """os.path.walk() visitor: hand the RCS files of DIRNAME to the parser.

  ARG is the tuple (cd, p, mtime): the sink, the rcsparse parser and a
  modification-time threshold.  Every '*,v' entry of FILES except
  'ChangeSet,v' whose mtime is not older than the threshold is parsed
  into the sink, after the sink has been told the file's name (with a
  trailing '/Attic' directory dropped).

  A 'BitKeeper' entry is removed from FILES in place, which is how
  os.path.walk() is told not to descend into it.
  """
  cd, p, mtime = arg

  # Iterate over a snapshot: FILES is edited in place below, and removing
  # an element from the list being iterated would skip the entry after it.
  for fname in files[:]:
    if fname == 'BitKeeper':
      files.remove(fname)
      continue

    if fname[-2:] != ',v' or fname == 'ChangeSet,v':
      continue

    pathname = os.path.join(dirname, fname)

    if os.stat(pathname)[stat.ST_MTIME] < mtime:
      continue

    if dirname[-6:] == '/Attic':
      # drop the 'Attic' portion from the pathname
      ### we should record this so we can easily insert it back in
      cd.set_fname(os.path.join(dirname[:-6], fname))
    else:
      cd.set_fname(pathname)
    if verbose:
      print(pathname)

    # Close the file explicitly rather than leaving it to the collector.
    rcsfile = open(pathname)
    try:
      p.parse(rcsfile, cd)
    finally:
      rcsfile.close()


class TreeMirror:
  """Mirror of the head tree, kept in the HEAD_MIRROR_FILE database.

  Each node is stored under a hexadecimal string key whose value is a
  marshalled dictionary mapping entry names to the keys of the child
  nodes.  The root directory is key '0'; a leaf is a node with an empty
  dictionary.  The 'gen_key_base' record holds the number from which
  the next key will be generated.
  """

  def __init__(self):
    'Open a db file to mirror the head tree.'
    self.db = anydbm.open(HEAD_MIRROR_FILE, 'c')
    if not self.db.has_key('0'):
      self.db['0'] = marshal.dumps({}) # Init as a dir with no entries
    if self.db.has_key('gen_key_base'):
      self.gen_key_base = marshal.loads(self.db['gen_key_base'])
    else:
      self.gen_key_base = 0
      self.db['gen_key_base'] = marshal.dumps(self.gen_key_base)
      # The first key handed out is '0', i.e. the root directory.
      empty_dir = marshal.dumps({})
      self.db[self.gen_key()] = empty_dir

  def gen_key(self):
    'Return a fresh node key and persist the advanced key counter.'
    allotted = self.gen_key_base
    self.gen_key_base = allotted + 1
    self.db['gen_key_base'] = marshal.dumps(self.gen_key_base)
    return '%x' % allotted

  def ensure_path(self, path, ctx, output_intermediate_dir):
    """Add PATH to the tree.  PATH may not have a leading slash.
    Return None if the last component of PATH already existed, else
    create it and return 1.
    OUTPUT_INTERMEDIATE_DIR is called as
    OUTPUT_INTERMEDIATE_DIR(full_path, CTX) once for each missing
    intermediate directory in PATH, in order from shortest to longest,
    right after that directory has been added to the mirror."""

    parts = path.split('/')
    prefix = None

    dir_key = '0'
    entries = marshal.loads(self.db[dir_key])

    # Walk down through (and create as needed) the intermediate dirs.
    for name in parts[:-1]:
      if not prefix:
        prefix = name
      else:
        prefix = prefix + '/' + name

      if entries.has_key(name):
        next_key = entries[name]
      else:
        next_key = self.gen_key()
        entries[name] = next_key
        self.db[dir_key] = marshal.dumps(entries)
        self.db[next_key] = marshal.dumps({})
        output_intermediate_dir(prefix, ctx)

      dir_key = next_key
      entries = marshal.loads(self.db[dir_key])

    # Now add the last node, probably the versioned file.
    leaf = parts[-1]
    if entries.has_key(leaf):
      return None

    leaf_key = self.gen_key()
    entries[leaf] = leaf_key
    self.db[dir_key] = marshal.dumps(entries)
    self.db[leaf_key] = marshal.dumps({})
    return 1

  def _delete_tree(self, key):
    "Delete KEY and everything underneath it."
    children = marshal.loads(self.db[key])
    for child_key in children.values():
      self._delete_tree(child_key)
    del self.db[key]

  def delete_path(self, path):
    """Remove PATH from the tree and return the path actually removed.

    If the directory that directly contains the last component of PATH
    holds exactly one entry, that directory is removed (together with
    whatever is below it) and its path is returned; otherwise just PATH
    is removed and returned.  Every component of PATH is expected to
    exist already; a missing one raises KeyError."""
    parts = path.split('/')
    prefix = None

    # Set while walking down: when the directory just entered has a
    # single entry this is
    #
    #   (PATH, PARENT_DIR_DICTIONARY, PARENT_DIR_DB_KEY)
    #
    # where PATH is that directory and the other two elements describe
    # the directory containing it; otherwise it is None.  Only the value
    # left by the last directory visited is acted upon.
    prune = None

    dir_key = '0'
    entries = marshal.loads(self.db[dir_key])

    # Find the target of the delete.
    for name in parts[:-1]:
      if not prefix:
        prefix = name
      else:
        prefix = prefix + '/' + name

      above_key = dir_key
      above_entries = entries

      dir_key = above_entries[name]
      entries = marshal.loads(self.db[dir_key])

      if len(entries) == 1:
        prune = (prefix, above_entries, above_key)
      else:
        prune = None

    # Remove subtree, if any, then remove this entry from its parent.
    if prune:
      path, entries, dir_key = prune
      victim = os.path.basename(path)
    else:
      victim = parts[-1]

    self._delete_tree(entries[victim])
    del entries[victim]
    self.db[dir_key] = marshal.dumps(entries)

    return path

  def close(self):
    'Close the underlying db.'
    self.db.close()


class Dump:
  def __init__(self):
    'Open DUMPFILE_PATH, and initialize revision to REVISION.'
    self.dumpfile = None
    self.head_mirror = TreeMirror()
    self.initialized = 0

  def skip_rev(self, ctx):
    if ctx.state.get_rev() < ctx.skip_rev:
      return 1
    else:
      return 0

  def start_revision(self, props, ctx):
    'Write a revision, with properties, to the dumpfile.'
    if self.skip_rev(ctx):
      return

    if not self.initialized:
      self.dumpfile = open(DUMPFILE, 'wb')
      self.dumpfile.write('SVN-fs-dump-format-version: 2\n'
			  '\n'
			  'UUID: ????????-????-????-????-????????????\n'
			  '\n')
      self.initialized = 1

    # Calculate the total length of the props section.
    total_len = 10  # len('PROPS-END\n')
    for propname in props.keys():
      klen = len(propname)
      klen_len = len('K %d' % klen)
      vlen = len(props[propname])
      vlen_len = len('V %d' % vlen)
      # + 4 for the four newlines within a given property's section
      total_len = total_len + klen + klen_len + vlen + vlen_len + 4
        
    # Print the revision header and props
    self.dumpfile.write('Revision-number: %d\n'
                        'Prop-content-length: %d\n'
                        'Content-length: %d\n'
                        '\n'
                        % (ctx.state.get_rev(), total_len, total_len))

    for propname in props.keys():
      self.dumpfile.write('K %d\n' 
                          '%s\n' 
                          'V %d\n' 
                          '%s\n' % (len(propname),
                                    propname,
                                    len(props[propname]),
                                    props[propname]))

    self.dumpfile.write('PROPS-END\n')
    self.dumpfile.write('\n')

  def end_revision(self, ctx):
    if self.skip_rev(ctx):
      print 'Skipped rev', ctx.state.get_rev()
    else:
      if verbose:
        print 'Dumped rev', ctx.state.get_rev()

      self.initialized = 0
      self.dumpfile.close()

      # Handle the import into svn
      if os.system('svnadmin load --ignore-uuid %s < %s' % (ctx.target, DUMPFILE)):
        ctx.state.close()
        sys.exit(1)
      os.unlink(DUMPFILE)

    ctx.state.incr_rev()

  def add_dir(self, path, ctx):
    if self.skip_rev(ctx):
      return

    self.dumpfile.write("Node-path: %s\n" 
                        "Node-kind: dir\n"
                        "Node-action: add\n"
                        "Prop-content-length: 10\n"
                        "Content-length: 10\n"
                        "\n"
                        "PROPS-END\n"
                        "\n"
                        "\n" % path)

  def tag_trunk(self, ctx, name):
    self.head_mirror.ensure_path('tags/', ctx, self.add_dir)
    if self.skip_rev(ctx):
      return

    self.dumpfile.write("Node-path: tags/%s\n"
    			"Node-kind: dir\n"
			"Node-action: add\n"
			"Node-copyfrom-rev: %d\n"
			"Node-copyfrom-path: /trunk\n"
			"\n"
			"\n"
			% (name, ctx.state.get_rev() - 1))

  def add_or_change_path(self, ctx, cvs_path, svn_path, cvs_rev, rcs_file):

    # You might think we could just test
    #
    #   if cvs_rev[-2:] == '.1':
    #
    # to determine if this path exists in head yet.  But that wouldn't
    # be perfectly reliable, both because of 'cvs commit -r', and also
    # the possibility of file resurrection.
    if self.head_mirror.ensure_path(svn_path, ctx, self.add_dir):
      action = 'add'
    else:
      action = 'change'

    if self.skip_rev(ctx):
      return

    # figure out the real file path for "co"
    try:
      f_st = os.stat(rcs_file)
    except os.error:
      dirname, fname = os.path.split(rcs_file)
      rcs_file = os.path.join(dirname, 'Attic', fname)
      f_st = os.stat(rcs_file)

    if f_st[0] & stat.S_IXUSR:
      is_executable = 1
      # "K 14\n" + "svn:executable\n" + "V 1\n" + "*\n" + "PROPS-END\n"
      props_len = 36
    else:
      is_executable = 0
      # just "PROPS-END\n"
      props_len = 10

    basename = os.path.basename(rcs_file[:-2])
    pipe = os.popen('co -q -p%s \'%s\'' % (cvs_rev, rcs_file), 'r', 102400)

    self.dumpfile.write('Node-path: %s\n'
                        'Node-kind: file\n'
                        'Node-action: %s\n'
                        'Prop-content-length: %d\n'
                        'Text-content-length: '
                        % (svn_path, action, props_len))

    pos = self.dumpfile.tell()

    self.dumpfile.write('0000000000000000\n'
                        'Text-content-md5: 00000000000000000000000000000000\n'
                        'Content-length: 0000000000000000\n'
                        '\n')

    if is_executable:
      self.dumpfile.write('K 14\n'
                          'svn:executable\n'
                          'V 1\n'
                          '*\n')

    self.dumpfile.write('PROPS-END\n')

    # Insert the rev contents, calculating length and checksum as we go.
    checksum = md5.new()
    length = 0
    buf = pipe.read()
    while buf:
      checksum.update(buf)
      length = length + len(buf)
      self.dumpfile.write(buf)
      buf = pipe.read()
    pipe.close()

    # Go back to patch up the length and checksum headers:
    self.dumpfile.seek(pos, 0)
    # We left 16 zeros for the text length; replace them with the real
    # length, padded on the left with spaces:
    self.dumpfile.write('%16d' % length)
    # 16... + 1 newline + len('Text-content-md5: ') == 35
    self.dumpfile.seek(pos + 35, 0)
    self.dumpfile.write(checksum.hexdigest())
    # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
    self.dumpfile.seek(pos + 84, 0)
    # The content length is the length of property data, text data,
    # and any metadata around/inside around them.
    self.dumpfile.write('%16d' % (length + props_len))
    # Jump back to the end of the stream
    self.dumpfile.seek(0, 2)

    # This record is done.
    self.dumpfile.write('\n')

  def delete_path(self, ctx, svn_path):
    deleted_path = self.head_mirror.delete_path(svn_path)
    if self.skip_rev(ctx):
      return

    self.dumpfile.write('Node-path: %s\n'
                        'Node-action: delete\n'
                        '\n' % deleted_path)

  def close_dumpfile(self):
    self.initialized = 0
    if self.dumpfile:
      self.dumpfile.close()

  def close(self):
    self.close_dumpfile()
    self.head_mirror.close()


def parse_revs_line(line):
  """Split one revision-list LINE into (timestamp, op, rev, fname).

  LINE has the form written by write_revs_line(): a hexadecimal
  timestamp, the operation code, the revision and the file name,
  separated by single spaces.  The file name is everything after the
  third space and so may itself contain spaces.  A trailing newline,
  if present, is not part of the file name.
  """
  fields = line.split(' ', 3)
  fname = fields[3]
  # Strip the newline only when there is one, so that a final line
  # without a terminator does not lose the last character of its name.
  if fname[-1:] == '\n':
    fname = fname[:-1]
  return int(fields[0], 16), fields[1], fields[2], fname


def write_revs_line(output, timestamp, op, revision, fname):
  """Write one revision-list record to OUTPUT.

  The timestamp is written as zero-padded hexadecimal of at least eight
  digits, followed by OP, REVISION and FNAME, space separated, and a
  newline."""
  record = (timestamp, op, revision, fname)
  output.write('%08lx %s %s %s\n' % record)


def pass1(ctx):
  """Collect the new file revisions below ctx.cvsroot and sort them.

  The tree is walked with visit_file(), which writes the unsorted
  revision list; the external 'sort' command then produces the sorted
  list and the unsorted one is deleted."""
  unsorted_revs = DATAFILE + REVS_SUFFIX
  sorted_revs = DATAFILE + SORTED_REVS_SUFFIX

  collector = CollectData(ctx)
  parser = rcsparse.Parser()
  walk_arg = (collector, parser, ctx.state.get_mtime())
  os.path.walk(ctx.cvsroot, visit_file, walk_arg)
  collector.close()

  os.system('sort %s > %s' % (unsorted_revs, sorted_revs))

  os.unlink(unsorted_revs)


def start_rev(ctx, dump, ts):
  """Begin the dump revision for the changeset with timestamp TS.

  Return 1 on success.  If no changeset is known for TS, print an error
  and return 0 without starting a revision.  Author and log message are
  read as iso-8859-1 and written as utf8; if that conversion fails the
  state db is closed and the program exits with status 1."""
  date = time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(ts))

  try:
    author, log = changesets[ts]
  except KeyError:
    print "ERROR: no ts for ", date, ts
    return 0

  try:
    props = {
      'svn:author' : unicode(author, "iso-8859-1", 'replace').encode('utf8'),
      'svn:log' : unicode(log, "iso-8859-1", 'replace').encode('utf8'),
      'svn:date' : date,
      }
  except UnicodeError:
    print 'Problem encoding author or log message:'
    print "  author: '%s'" % author
    print "  log:    '%s'" % log
    print "  date:   '%s'" % date
    ctx.state.close()
    sys.exit(1)

  dump.start_revision(props, ctx)

  return 1


def flush_rev(ctx, dump, ts):
  """Finish the revision for timestamp TS and record TS in the state.

  If a tag is known for TS, one more revision is emitted, authored by
  'tagger', that tags the trunk with it."""
  dump.end_revision(ctx)

  # Check for tags
  if cset_tags.has_key(ts):
    tag = cset_tags[ts]
    props = {
      'svn:author' : 'tagger',
      'svn:log' : 'Tag ' + tag,
      'svn:date' : time.strftime("%Y-%m-%dT%H:%M:%S.000000Z",
                                 time.gmtime(ts)),
      }
    dump.start_revision(props, ctx)
    dump.tag_trunk(ctx, tag)
    dump.end_revision(ctx)

  # Set the ts of the last revision
  ctx.state.set_ts(ts)


def add_file_to_rev(ctx, dump, op, rcs_file, cvs_rev):
  """Apply operation OP for revision CVS_REV of RCS_FILE to the dump.

  The svn path is 'trunk/' followed by RCS_FILE, minus its last two
  characters, made relative to ctx.cvsroot.  OP_CHANGE adds or changes
  that path; any other OP deletes it."""
  cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
  svn_path = 'trunk/' + cvs_path

  if op != OP_CHANGE:
    # OP_DELETE
    dump.delete_path(ctx, svn_path)
    return

  dump.add_or_change_path(ctx, cvs_path, svn_path, cvs_rev, rcs_file)


def pass2(ctx):
  """Turn the sorted revision list into svn revisions.

  Consecutive lines with the same timestamp form one revision.  If a
  revision cannot be started, processing stops there.  In every case
  the dump is closed and the sorted revision list is deleted before
  returning."""
  sorted_revs = DATAFILE + SORTED_REVS_SUFFIX
  current_ts = None

  # Start the dumpfile object.
  dump = Dump()

  for line in fileinput.FileInput(sorted_revs):
    timestamp, op, rev, fname = parse_revs_line(line)

    if timestamp != current_ts:
      # A new timestamp closes the revision in progress, if any ...
      if current_ts:
        flush_rev(ctx, dump, current_ts)

      # ... and starts the next rev
      current_ts = timestamp
      if not start_rev(ctx, dump, current_ts):
        dump.close()
        os.unlink(sorted_revs)
        return

    add_file_to_rev(ctx, dump, op, fname, rev)

  # End of the sorted revs file.  Flush remaining revision
  if current_ts:
    flush_rev(ctx, dump, current_ts)

  dump.close()

  os.unlink(sorted_revs)


# The conversion passes, in the order convert() runs them.  The -p
# command-line option selects the (1-based) pass to start from.
_passes = [
  pass1,
  pass2,
  ]

class _ctx:
  """Plain attribute holder for the operation context.

  main() stores cvsroot, state, target and skip_rev on an instance."""

def convert(ctx, start_pass=1):
  "Convert a CVS repository to an SVN repository."

  if verbose:
    print '----- parsing changesets -----'

  csets = ChangeSetParser()
  csets.parse_repo(ctx.cvsroot)
  mtime = csets.mtime
  del csets

  times = [ None ] * len(_passes)
  for i in range(start_pass - 1, len(_passes)):
    times[i] = time.time()
    if verbose:
      print '-----       pass %d       -----' % (i + 1)
    _passes[i](ctx)
  times.append(time.time())

  ctx.state.set_mtime(mtime)

  if verbose:
    for i in range(start_pass, len(_passes)+1):
      print 'pass %d: %d seconds' % (i, int(times[i] - times[i-1]))
    print ' total:', int(times[len(_passes)] - times[start_pass-1]), 'seconds'

def usage(ctx):
  print 'USAGE: %s [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
        % os.path.basename(sys.argv[0])
  print '  -v               verbose.'
  print '  -s PATH          path for SVN repos.'
  print '  -p NUM           start at pass NUM of %d.' % len(_passes)
  print '  -r REV           skip output until this revision'
  sys.exit(1)

def main():
  """Parse the command line, run the conversion and close the state db.

  Exactly one positional argument, the CVS repository path, is
  required.  Options: -v (verbose), -s PATH (target svn repository,
  default "svnrepo"), -p NUM (first pass to run), -r REV (produce no
  output for revisions below REV).  Bad usage prints the help and exits
  with status 1.
  """
  # prepare the operation context
  global verbose

  ctx = _ctx()
  ctx.cvsroot = None
  ctx.state = BKcvs2svn_State()
  ctx.target = "svnrepo"

  try:
    opts, args = getopt.getopt(sys.argv[1:], 'p:s:r:v', [ ])
  except getopt.GetoptError:
    usage(ctx)
  if len(args) != 1:
    usage(ctx)

  ctx.cvsroot = args[0]
  ctx.skip_rev = 1
  start_pass = 1

  for opt, value in opts:
    if opt == '-p':
      start_pass = int(value)
      if start_pass < 1 or start_pass > len(_passes):
        print 'ERROR: illegal value (%d) for starting pass. ' \
              'must be 1 through %d.' % (start_pass, len(_passes))
        sys.exit(1)
    elif opt == '-v':
      verbose = 1
    elif opt == '-s':
      ctx.target = value
    elif opt == '-r':
      ctx.skip_rev = int(value)

  convert(ctx, start_pass=start_pass)

  # Close the state db explicitly so that what the run recorded is
  # written out.  This is reached only when convert() returns normally;
  # the error paths that exit early close the db themselves.
  ctx.state.close()

# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
  main()


