====== Usage ======
$ python TERC.py TERC.xml TERC.csv
====== Code ======
import sys
import csv, codecs, cStringIO
from lxml import etree
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
# Parse the XML file named by the first command-line argument.
# The file is opened in binary mode so the parser sees the raw bytes, and
# inside a "with" block so the handle is closed once parsing is done
# (previously it was opened in text mode and never closed).
with open(sys.argv[1], 'rb') as xml_file:
    xml = etree.parse(xml_file)

# The first child of the root element; each of its children is treated
# as one CSV row and each grandchild as one cell.
catalog = xml.getroot()[0]

# Write the CSV to the path given as the second command-line argument.
# Binary mode is what the Python 2 csv module expects for output files.
with open(sys.argv[2], 'wb') as fp:
    writer = UnicodeWriter(fp)
    # An element without text has .text == None; emit an empty cell for it.
    writer.writerows(
        [[field.text or '' for field in record] for record in catalog]
    )
====== Wynik ======
Dla pliku {{:it:terc.xml|TERC.xml}} pobranego z http://www.stat.gov.pl/broker/access/prefile/listPreFiles.jspa skrypt dał {{:it:terc.csv|TERC.csv}} – o wiele lżej niż oficjalny [[http://www.stat.gov.pl/bip/389_112_PLK_HTML.htm|program do konwersji]].
~~DISCUSSION~~