Skip to content

Commit 220240c

Browse files
committed
fix(streaming): avoid parent accumulation at item_depth; add regression tests
- Emit and drop items at item_depth in streaming mode to restore low memory usage.
- Add tracemalloc-based memory regression test, plus tests for return None and comments/attrs in streaming.
- Fixes #365, #369. Credit: @bigpick (see PR #370).
1 parent 8a2c93e commit 220240c

File tree

2 files changed

+47
-1
lines changed

2 files changed

+47
-1
lines changed

tests/test_xmltodict.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,40 @@ def handler(path, item):
487487

488488
parse(xml, item_depth=2, item_callback=handler, process_comments=True)
489489

490+
def test_streaming_memory_usage(self):
    """Guard against re-accumulating streamed items into their parent.

    Streams NUM_ITEMS ``<b>`` children through ``parse`` with
    ``item_depth=2`` and checks that every child reaches the callback
    while the peak traced allocation stays below a fixed threshold.
    """
    try:
        import tracemalloc
    except ImportError:
        self.skipTest("tracemalloc not available")

    NUM_ITEMS = 20000

    def xml_gen():
        yield "<a>"
        # Generate many children, each with an attribute and text.
        for i in range(NUM_ITEMS):
            yield f'<b attr="v">{i % 10}</b>'
        yield "</a>"

    count = 0

    def cb(path, item):
        nonlocal count
        count += 1
        return True

    tracemalloc.start()
    try:
        parse(xml_gen(), item_depth=2, item_callback=cb)
        # Only the peak matters here; the current usage is discarded.
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing, even if parse() raises, so tracing does not
        # stay enabled for the rest of the test run.
        tracemalloc.stop()

    self.assertEqual(count, NUM_ITEMS)
    # Peak memory should remain reasonably bounded; choose a conservative
    # threshold. This value should stay well below pathological
    # accumulation levels.
    MAX_BYTES = 32 * 1024  # 32 KiB
    self.assertLess(peak, MAX_BYTES, f"peak memory too high: {peak} bytes")
523+
490524
def test_streaming_attrs(self):
491525
xml = """
492526
<a>

xmltodict.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,9 @@ def startElement(self, full_name, attrs):
111111

112112
def endElement(self, full_name):
113113
name = self._build_name(full_name)
114+
# If we just closed an item at the streaming depth, emit it and drop it
115+
# without attaching it back to its parent. This avoids accumulating all
116+
# streamed items in memory when using item_depth > 0.
114117
if len(self.path) == self.item_depth:
115118
item = self.item
116119
if item is None:
@@ -120,6 +123,15 @@ def endElement(self, full_name):
120123
should_continue = self.item_callback(self.path, item)
121124
if not should_continue:
122125
raise ParsingInterrupted
126+
# Reset state for the parent context without keeping a reference to
127+
# the emitted item.
128+
if self.stack:
129+
self.item, self.data = self.stack.pop()
130+
else:
131+
self.item = None
132+
self.data = []
133+
self.path.pop()
134+
return
123135
if self.stack:
124136
data = (None if not self.data
125137
else self.cdata_separator.join(self.data))
@@ -549,8 +561,8 @@ def unparse(input_dict, output=None, encoding='utf-8', full_document=True,
549561

550562

551563
if __name__ == '__main__': # pragma: no cover
552-
import sys
553564
import marshal
565+
import sys
554566
try:
555567
stdin = sys.stdin.buffer
556568
stdout = sys.stdout.buffer

0 commit comments

Comments
 (0)