Lightweight XML parser

DOM parsers may consume gigabytes of memory when parsing large amounts of XML data; a SAX parser works much more efficiently. This is an example of how to use SAX to transform XML into a Python data object.

# Each case maps an XML document plus the parser configuration ('keys',
# 'lists') to the data structure the parser is expected to produce ('data').
TEST_DATASET = [
    # Case 1: without any requested keys nothing is collected.
    {
        'xml': """
<xml>
  <data>test</data>
</xml>""",
        'keys': [],
        'lists': [],
        'data': {},
    },
    # Case 2: a requested key that never occurs in the document is ignored.
    {
        'xml': """
<xml>
  <data>test</data>
</xml>""",
        'keys': ['xml.data', 'xml.nonexistent'],
        'lists': [],
        'data': {'xml': {'data': 'test'}},
    },
    # Case 3: an attribute is addressed as '<element path>.<attribute>'.
    {
        'xml': """
<xml>
  <data id="100"/>
</xml>""",
        'keys': ['xml.data.id'],
        'lists': [],
        'data': {'xml': {'data': {'id': '100'}}},
    },
    # Case 4: a repeated element declared in 'lists' yields one dict per
    # occurrence.
    {
        'xml': """
<xml>
  <data>
    <item id="1">
        <color>purple</color>
    </item>
    <item id="2">
        <color>cyan</color>
    </item>
  </data>
</xml>""",
        'keys': ['xml.data.item.id', 'xml.data.item.color'],
        'lists': ['xml.data.item'],
        'data': {
            'xml': {
                'data': {
                    'item': [
                        {'color': 'purple', 'id': '1'},
                        {'color': 'cyan', 'id': '2'},
                    ],
                },
            },
        },
    },
]

Gist: https://gist.github.com/nanvel/f944eae1f02d47b6d6a4

from xml import sax


class XMLParser(sax.handler.ContentHandler):
    """SAX content handler that collects selected values into nested dicts.

    Elements are addressed by a dotted path built from the element names
    from the root down to the element (``'home.categories.item'``).  An
    attribute is addressed by appending its name to the path of the element
    that carries it (``'home.categories.item.id'``).

    The result is built up in :attr:`data`.  ``data`` is only created in
    ``__init__``, so reusing one handler for several documents accumulates
    their values in the same structure.

    Known limitations:

    * a path listed in ``keys`` as a text value is not also searched for
      attributes;
    * for an element that contains child elements, only the text following
      its last child is kept as the element's own value;
    * element names containing ``'.'`` are tracked correctly while parsing,
      but their path is indistinguishable from a nested path.
    """

    def __init__(self, keys=None, lists=None, *args, **kwargs):
        """
        :param keys: dotted paths of the values to be collected into ``data``
            (any container supporting ``in`` and iteration); defaults to an
            empty list
        :param lists: dotted paths of elements to be represented as a list
            with one entry per occurrence; defaults to an empty list

        :example keys: 'messages.status', 'home.categories.item.label'
        :example lists: 'home.categories.item'
        """
        sax.handler.ContentHandler.__init__(self, *args, **kwargs)
        # ``None`` defaults avoid sharing one mutable list between all
        # instances created without arguments.
        self.keys = [] if keys is None else keys
        self.lists = [] if lists is None else lists
        self.data = {}
        self.current_path = ''
        # Attributes of the currently open elements, keyed by element path.
        self.current_attrs = {}
        self.content = ''
        # short keys - keys without last element (used to parse attributes)
        self.short_keys = []
        for key in self.keys:
            parent, dot, _ = key.rpartition('.')
            if dot:
                self.short_keys.append(parent)

    def startDocument(self):
        """Reset per-document parsing state (collected ``data`` is kept)."""
        # A previous parse that was aborted by an error would otherwise leave
        # a stale path behind and corrupt every path of the next document.
        self.current_path = ''
        self.current_attrs = {}
        self.content = ''

    def startElement(self, name, attrs):
        """Enter element *name*: extend the path and open a list item.

        :param name: raw element name as reported by the SAX reader
        :param attrs: attributes object of the element
        """
        if self.current_path:
            self.current_path += '.'
        self.current_path += name
        # Key by full path, not by bare name: a nested element with the same
        # name must not overwrite the attributes of its still-open ancestor.
        self.current_attrs[self.current_path] = attrs
        # Text seen before this tag belongs to the parent, not to this
        # element; drop it so it is not prepended to this element's value.
        self.content = ''
        if self.current_path in self.lists:
            # create new item in list
            parts = self.current_path.split('.')
            parent = self._descend(parts[:-1])
            parent.setdefault(parts[-1], []).append({})

    def characters(self, content):
        """Accumulate character data (it may arrive in several chunks)."""
        self.content += content

    def _descend(self, parts):
        """Return the container reached by following *parts* from ``data``.

        Missing intermediate dicts are created.  For a path declared in
        ``lists`` the walk continues in the most recently added list item.
        """
        block = self.data
        prefix = ''
        for depth, part in enumerate(parts):
            prefix = part if depth == 0 else prefix + '.' + part
            if prefix in self.lists:
                block = block[part][-1]
            else:
                block = block.setdefault(part, {})
        return block

    def addValue(self, path_str, value):
        """Store *value* in ``data`` under the dotted path *path_str*.

        :param path_str: dotted path of the value, e.g. ``'xml.data.id'``
        :param value: value to store
        """
        parts = path_str.split('.')
        parent = self._descend(parts[:-1])
        leaf = parts[-1]
        existing = parent.get(leaf)
        if path_str in self.lists and isinstance(existing, list) and existing:
            # The path is a list of plain values: fill the slot opened by
            # startElement instead of replacing the whole list, which would
            # break the next occurrence of the element.
            existing[-1] = value
        else:
            parent[leaf] = value

    def endElement(self, name):
        """Leave element *name*: store its text or attributes if requested.

        :param name: raw element name as reported by the SAX reader
        """
        path = self.current_path
        # Popping also releases the attributes of elements already closed.
        attrs = self.current_attrs.pop(path, None)
        if path in self.keys:
            self.addValue(path, self.content.strip())
        elif path in self.short_keys and attrs is not None:
            # parse attributes
            for attr_name in attrs.keys():
                attr_path = '{path}.{attr}'.format(path=path, attr=attr_name)
                if attr_path in self.keys:
                    self.addValue(attr_path, attrs[attr_name])
        # Strip exactly '.<name>' from the end; splitting on '.' would remove
        # only part of a name that itself contains a dot.
        cut = len(path) - len(name) - 1
        self.current_path = path[:cut] if cut > 0 else ''
        self.content = ''

    def endDocument(self):
        """Nothing to finalise; the result is available in ``data``."""
        pass


# Each case maps an XML document plus the parser configuration ('keys',
# 'lists') to the data structure the parser is expected to produce ('data').
TEST_DATASET = [
    # Case 1: without any requested keys nothing is collected.
    {
        'xml': """
<xml>
  <data>test</data>
</xml>""",
        'keys': [],
        'lists': [],
        'data': {},
    },
    # Case 2: a requested key that never occurs in the document is ignored.
    {
        'xml': """
<xml>
  <data>test</data>
</xml>""",
        'keys': ['xml.data', 'xml.nonexistent'],
        'lists': [],
        'data': {'xml': {'data': 'test'}},
    },
    # Case 3: an attribute is addressed as '<element path>.<attribute>'.
    {
        'xml': """
<xml>
  <data id="100"/>
</xml>""",
        'keys': ['xml.data.id'],
        'lists': [],
        'data': {'xml': {'data': {'id': '100'}}},
    },
    # Case 4: a repeated element declared in 'lists' yields one dict per
    # occurrence.
    {
        'xml': """
<xml>
  <data>
    <item id="1">
        <color>purple</color>
    </item>
    <item id="2">
        <color>cyan</color>
    </item>
  </data>
</xml>""",
        'keys': ['xml.data.item.id', 'xml.data.item.color'],
        'lists': ['xml.data.item'],
        'data': {
            'xml': {
                'data': {
                    'item': [
                        {'color': 'purple', 'id': '1'},
                        {'color': 'cyan', 'id': '2'},
                    ],
                },
            },
        },
    },
]


if __name__ == '__main__':
    # Self-check: parse every sample document with its own configuration and
    # compare the collected structure with the expected one.
    for case in TEST_DATASET:
        handler = XMLParser(keys=case['keys'], lists=case['lists'])
        sax.parseString(case['xml'], handler)
        expected = case['data']
        actual = handler.data
        assert actual == expected, '{0} != {1}'.format(actual, expected)

Licensed under CC BY-SA 3.0