import xml.sax
import xml.sax.handler

"""ifarchivexml:

This module parses the Master-Index.xml file that is available at
<http://www.ifarchive.org/indexes/Master-Index.xml>.

You can use this module like this:
  import ifarchivexml
  (root, dirs, files) = ifarchivexml.parse('Master-Index.xml')

root is an IFDir object representing the root directory ('if-archive').
dirs is a dictionary mapping directory names ('if-archive/games', for
example) to IFDir objects. files is a dictionary mapping file pathnames
('if-archive/games/playgame.FAQ', for example) to IFFile objects.

You can display the contents of either an IFDir or IFFile object with
the obj.dump() method.

There is also a callback form:
  ifarchivexml.parse_callback('Master-Index.xml', dirfunc=FUNC, filefunc=FUNC)

The parse_callback() function returns nothing; it calls the given functions
on each IFDir and IFFile as they are encountered. (You don't have to
supply both callbacks.) Master-Index.xml is conventionally created in
directory-tree order, so your callbacks will encounter parents before
children. In this mode, the parentobj and directoryobj fields of IFDir and
IFFile will not be set.

Dec 2019: Updated to Python 3; added sha512 and metadata fields.
Apr 2025: Added parentdesc field; support date and metadata fields for
  directories; removed xdir field. Added the parse_callback() form.
"""

# Parser states: which kind of element IFAParser is currently inside.
# Every state needs its own value, because the handler methods decide what
# to do by comparing self.context against these constants.
CONTEXT_NONE = 0
CONTEXT_DIR = 1
CONTEXT_FILE = 2
CONTEXT_DIRLINK = 3
CONTEXT_FILELINK = 4
CONTEXT_METADATA = 5
# Was 5, which made "inside <item>" indistinguishable from "inside <metadata>".
CONTEXT_METAITEM = 6

class IFDir:
    """A directory listed in Master-Index.xml.

    Attributes filled in by IFAParser while it reads a <directory> element
    (each one only if the matching child element is present):
      name         directory name, e.g. 'if-archive/games'
      parent       name of the parent directory
      subdircount  integer taken from <subdircount>
      filecount    integer taken from <filecount>
      date         text of <date>
      rawdate      integer taken from <rawdate>
      metadata     dict mapping each key to a list of value strings
      description  text of <description>
      parentdescs  dict of <parentdesc> texts, keyed by the 'dir' attribute

    Attributes linked up at the end of a non-callback parse:
      parentobj    the parent IFDir (None when parent is '')
      subdirs      list of child IFDir objects
      files        list of IFFile objects in this directory
    """

    description = None
    date = None
    rawdate = None
    metadata = None
    # Never assigned in callback mode; defaulting to None keeps dump()
    # usable on objects handed to a callback.
    parentobj = None

    def __init__(self):
        self.subdirs = []
        self.files = []
        self.parentdescs = {}

    def __repr__(self):
        return '<IFDir \'' + self.name + '\'>'

    def dump(self):
        """Print every field of this directory to standard output."""
        print('name:   ', self.name)
        print('parent: ', self.parent, ('('+str(self.parentobj)+')'))
        print('subdircount:', self.subdircount)
        print('filecount:  ', self.filecount)
        if self.metadata is not None:
            print('metadata:')
            for (key, valls) in self.metadata.items():
                print(' ', key+':', ', '.join(valls))
        if self.description is not None:
            print('description:')
            print(self.description)
        for (key, text) in self.parentdescs.items():
            print('parentdesc (from %s)' % (key,))
            print(text)
        print('subdirs:')
        for subdir in self.subdirs:
            print(' ', str(subdir))
        print('files:')
        for fileobj in self.files:
            print(' ', str(fileobj))

class IFFile:
    """A file (or symlink) listed in Master-Index.xml.

    Attributes filled in by IFAParser while it reads a <file> element
    (each one only if the matching child element is present):
      path         full pathname, e.g. 'if-archive/games/playgame.FAQ'
      name         text of <name>
      directory    name of the containing directory
      size         integer taken from <size>
      date         text of <date>
      rawdate      integer taken from <rawdate>
      md5, sha512  checksum strings
      symlink      'dir' or 'file' when a <symlink> element is present
      symlinkname  target name (recorded for 'dir' symlinks)
      symlinkpath  target path (recorded for 'file' symlinks)
      metadata     dict mapping each key to a list of value strings
      description  text of <description>
      parentdescs  dict of <parentdesc> texts, keyed by the 'dir' attribute
      orderindex   position of this file in the index, counting from 0

    Attribute linked up at the end of a non-callback parse:
      directoryobj the IFDir that contains this file
    """

    size = None
    date = None
    md5 = None
    sha512 = None
    rawdate = None
    symlink = None
    metadata = None
    description = None
    # Never assigned in callback mode; defaulting to None keeps dump()
    # usable on objects handed to a callback.
    directoryobj = None

    def __init__(self):
        self.parentdescs = {}

    def __repr__(self):
        return '<IFFile \'' + self.path + '\'>'

    def dump(self):
        """Print every field of this file to standard output."""
        print('path:   ', self.path)
        print('name:   ', self.name)
        print('directory: ', self.directory, ('('+str(self.directoryobj)+')'))
        if self.symlink == 'dir':
            print('symlink to dir:')
            print('  name: ', self.symlinkname)
        if self.symlink == 'file':
            print('symlink to file:')
            print('  path: ', self.symlinkpath)
        print('size:   ', self.size)
        print('date:   ', self.date)
        print('rawdate:', self.rawdate)
        print('md5:    ', self.md5)
        print('sha512: ', self.sha512)
        print('orderindex:', self.orderindex)
        if self.metadata is not None:
            print('metadata:')
            for (key, valls) in self.metadata.items():
                print(' ', key+':', ', '.join(valls))
        if self.description is not None:
            print('description:')
            print(self.description)
        for (key, text) in self.parentdescs.items():
            print('parentdesc (from %s)' % (key,))
            print(text)

class IFAParser(xml.sax.handler.ContentHandler):
    """SAX handler that turns Master-Index.xml into IFDir and IFFile objects.

    The handler is a small state machine. self.context records which kind
    of element is currently open (one of the CONTEXT_* constants), and
    self.elements maps each recognised tag name to a (start, end) pair of
    bound methods. Tags that are not in self.elements are ignored.

    With no callbacks, finished objects are collected in self.directories
    (keyed by name) and self.files (keyed by path) and are cross-linked
    when </ifarchive> is reached. With callbacks=(dirfunc, filefunc), each
    finished object is passed to the matching function instead; nothing is
    collected or linked.
    """

    def __init__(self, callbacks=None):
        super().__init__()
        self.callbackmode = bool(callbacks)
        if self.callbackmode:
            self.dircallback = callbacks[0]
            self.filecallback = callbacks[1]
            self.directories = None
            self.files = None
        else:
            self.directories = {}
            self.files = {}
        self.grabbeddata = ''
        self.curdir = None
        self.curfile = None
        # curitem holds either the [key, value, ...] list of the open
        # metadata <item>, or the 'dir' attribute of the open <parentdesc>.
        self.curitem = None
        self.curmetaowner = None
        self.orderindex = 0
        self.context = CONTEXT_NONE
        self.elements = {
            'ifarchive': (self.ignore_start, self.ifarchive_end),
            'directory': (self.directory_start, self.directory_end),
            'file': (self.file_start, self.file_end),
            'metadata': (self.metadata_start, self.metadata_end),
            'item': (self.item_start, self.item_end),
            'key': (self.grabdata_start, self.key_end),
            'value': (self.grabdata_start, self.value_end),
            'name': (self.grabdata_start, self.name_end),
            'filecount': (self.grabdata_start, self.filecount_end),
            'subdircount': (self.grabdata_start, self.subdircount_end),
            'parent': (self.grabdata_start, self.parent_end),
            'path': (self.grabdata_start, self.path_end),
            'size': (self.grabdata_start, self.size_end),
            'date': (self.grabdata_start, self.date_end),
            'rawdate': (self.grabdata_start, self.rawdate_end),
            'md5': (self.grabdata_start, self.md5_end),
            'sha512': (self.grabdata_start, self.sha512_end),
            'description': (self.grabdata_start, self.description_end),
            'parentdesc': (self.parentdesc_start, self.parentdesc_end),
            'symlink': (self.symlink_start, self.symlink_end),
        }

    # --- SAX entry points -------------------------------------------------

    def characters(self, data):
        # SAX may deliver one text node in several pieces, so accumulate.
        self.grabbeddata += data

    def startElement(self, name, attrs):
        handlers = self.elements.get(name)
        if handlers is None:
            return
        (startfunc, _endfunc) = handlers
        startfunc(attrs)

    def endElement(self, name):
        handlers = self.elements.get(name)
        if handlers is None:
            return
        (_startfunc, endfunc) = handlers
        endfunc()

    # --- Generic helpers --------------------------------------------------

    def ignore_start(self, attrs):
        pass

    def ignore_end(self):
        pass

    def grabdata_start(self, attrs):
        """Start handler for text elements: discard any text seen so far."""
        self.grabbeddata = ''

    def grabdata(self):
        """Return the accumulated text and clear the buffer."""
        text = self.grabbeddata
        self.grabbeddata = ''
        return text

    def _store(self, context, owner, field, convert=None):
        """Store the grabbed text as owner.field if we are in the given context.

        Does nothing and returns False when self.context differs from
        context. Otherwise consumes the grabbed text, assigns it (passed
        through convert, if given) to the attribute when owner is not None,
        and returns True.
        """
        if self.context != context:
            return False
        text = self.grabdata()
        if owner is not None:
            setattr(owner, field, text if convert is None else convert(text))
        return True

    # --- <directory> and <file> -------------------------------------------

    def directory_start(self, attrs):
        if self.context == CONTEXT_NONE:
            self.curdir = IFDir()
            self.context = CONTEXT_DIR
        elif self.context == CONTEXT_FILE:
            # Inside a <file>, <directory> is a plain text field naming
            # the containing directory.
            self.grabdata_start(None)

    def directory_end(self):
        if self.context == CONTEXT_DIR:
            finished = self.curdir
            name = finished.name
            if self.callbackmode:
                self.dircallback(finished)
            else:
                self.directories[name] = finished
            self.curdir = None
            self.context = CONTEXT_NONE
        else:
            self._store(CONTEXT_FILE, self.curfile, 'directory')

    def file_start(self, attrs):
        if self.context == CONTEXT_NONE:
            self.curfile = IFFile()
            self.context = CONTEXT_FILE

    def file_end(self):
        if self.context != CONTEXT_FILE:
            return
        finished = self.curfile
        path = finished.path
        finished.orderindex = self.orderindex
        self.orderindex += 1
        if self.callbackmode:
            self.filecallback(finished)
        else:
            self.files[path] = finished
        self.curfile = None
        self.context = CONTEXT_NONE

    # --- <metadata> / <item> / <key> / <value> ----------------------------

    def metadata_start(self, attrs):
        if self.context == CONTEXT_FILE:
            owner = self.curfile
        elif self.context == CONTEXT_DIR:
            owner = self.curdir
        else:
            return
        self.curmetaowner = owner
        owner.metadata = {}
        self.context = CONTEXT_METADATA

    def metadata_end(self):
        if self.context == CONTEXT_METADATA:
            # Return to whichever context opened the metadata block.
            if self.curmetaowner is self.curfile:
                self.context = CONTEXT_FILE
            elif self.curmetaowner is self.curdir:
                self.context = CONTEXT_DIR
            else:
                raise Exception(
                    'metadata block does not belong to the current '
                    'file or directory')
        self.curmetaowner = None

    def item_start(self, attrs):
        if self.context == CONTEXT_METADATA:
            # Slot 0 is reserved for the key; values are appended after it.
            self.curitem = [None]
            self.context = CONTEXT_METAITEM

    def item_end(self):
        if self.context != CONTEXT_METAITEM:
            return
        (key, *values) = self.curitem
        # Items with no key or no values are dropped.
        if key and values:
            self.curmetaowner.metadata[key] = values
        self.curitem = None
        self.context = CONTEXT_METADATA

    def key_end(self):
        if self.context == CONTEXT_METAITEM:
            text = self.grabdata()
            if self.curitem is not None:
                self.curitem[0] = text

    def value_end(self):
        if self.context == CONTEXT_METAITEM:
            text = self.grabdata()
            if self.curitem is not None:
                self.curitem.append(text)

    # --- <symlink> --------------------------------------------------------

    def symlink_start(self, attrs):
        if self.context != CONTEXT_FILE:
            return
        # Any type other than 'dir' is treated as a link to a file.
        if attrs['type'] == 'dir':
            self.context = CONTEXT_DIRLINK
            self.curfile.symlink = 'dir'
        else:
            self.context = CONTEXT_FILELINK
            self.curfile.symlink = 'file'

    def symlink_end(self):
        if self.context in (CONTEXT_DIRLINK, CONTEXT_FILELINK):
            self.context = CONTEXT_FILE

    # --- Simple text fields -----------------------------------------------

    def name_end(self):
        if self._store(CONTEXT_DIR, self.curdir, 'name'):
            return
        if self._store(CONTEXT_FILE, self.curfile, 'name'):
            return
        self._store(CONTEXT_DIRLINK, self.curfile, 'symlinkname')

    def parent_end(self):
        self._store(CONTEXT_DIR, self.curdir, 'parent')

    def subdircount_end(self):
        self._store(CONTEXT_DIR, self.curdir, 'subdircount', int)

    def filecount_end(self):
        self._store(CONTEXT_DIR, self.curdir, 'filecount', int)

    def path_end(self):
        if self._store(CONTEXT_FILE, self.curfile, 'path'):
            return
        self._store(CONTEXT_FILELINK, self.curfile, 'symlinkpath')

    def size_end(self):
        self._store(CONTEXT_FILE, self.curfile, 'size', int)

    def date_end(self):
        if self._store(CONTEXT_FILE, self.curfile, 'date'):
            return
        self._store(CONTEXT_DIR, self.curdir, 'date')

    def rawdate_end(self):
        if self._store(CONTEXT_FILE, self.curfile, 'rawdate', int):
            return
        self._store(CONTEXT_DIR, self.curdir, 'rawdate', int)

    def md5_end(self):
        self._store(CONTEXT_FILE, self.curfile, 'md5')

    def sha512_end(self):
        self._store(CONTEXT_FILE, self.curfile, 'sha512')

    def description_end(self):
        if self._store(CONTEXT_DIR, self.curdir, 'description'):
            return
        self._store(CONTEXT_FILE, self.curfile, 'description')

    # --- <parentdesc dir="..."> -------------------------------------------

    def parentdesc_start(self, attrs):
        if self.context in (CONTEXT_DIR, CONTEXT_FILE):
            self.grabbeddata = ''
            # Remember which directory this description comes from.
            self.curitem = attrs['dir']

    def parentdesc_end(self):
        if self.context == CONTEXT_DIR:
            owner = self.curdir
        elif self.context == CONTEXT_FILE:
            owner = self.curfile
        else:
            return
        text = self.grabdata()
        if owner is not None:
            owner.parentdescs[self.curitem] = text
        self.curitem = None

    # --- </ifarchive> -----------------------------------------------------

    def ifarchive_end(self):
        """Cross-link the collected objects (non-callback mode only).

        Sets parentobj on every directory and directoryobj on every file,
        and fills in each directory's subdirs and files lists. Raises
        KeyError if an object names a directory that is not in the index.
        """
        if self.callbackmode:
            return
        for dirobj in self.directories.values():
            parentname = dirobj.parent
            if parentname == '':
                dirobj.parentobj = None
            else:
                dirobj.parentobj = self.directories[parentname]
                dirobj.parentobj.subdirs.append(dirobj)
        for fileobj in self.files.values():
            fileobj.directoryobj = self.directories[fileobj.directory]
            fileobj.directoryobj.files.append(fileobj)

def parse(filename):
    """Parse a Master-Index.xml file and return its whole contents.

    filename is the path of the XML file to read.

    Returns a tuple (root, dirs, files): root is the IFDir named
    'if-archive', dirs is a dictionary mapping directory names to IFDir
    objects, and files is a dictionary mapping file pathnames to IFFile
    objects.

    Raises KeyError if the index contains no 'if-archive' directory.
    Errors from opening the file or from the XML parser propagate
    unchanged.
    """
    parser = IFAParser()

    # Binary mode hands the raw bytes to the XML parser, so the encoding
    # declared in the document is used instead of the platform's default
    # text encoding. The with-block closes the file even if parsing raises.
    with open(filename, 'rb') as fl:
        xml.sax.parse(fl, parser)

    rootdir = parser.directories['if-archive']
    return (rootdir, parser.directories, parser.files)

def parse_callback(filename, dirfunc=None, filefunc=None):
    """Parse a Master-Index.xml file, reporting each entry to a callback.

    filename is the path of the XML file to read. dirfunc is called with
    each IFDir and filefunc with each IFFile as soon as its element has
    been read; either may be omitted. Objects are not collected or linked
    to each other in this mode.

    Returns None. Errors from opening the file, from the XML parser, or
    from the callbacks propagate unchanged.
    """
    if not dirfunc:
        dirfunc = lambda obj: None
    if not filefunc:
        filefunc = lambda obj: None

    parser = IFAParser(callbacks=(dirfunc, filefunc))

    # Binary mode hands the raw bytes to the XML parser, so the encoding
    # declared in the document is used instead of the platform's default
    # text encoding. The with-block closes the file even if parsing or a
    # callback raises.
    with open(filename, 'rb') as fl:
        xml.sax.parse(fl, parser)

