RFC 4180対応版 CSVレコードの分解
Posted feedbacks - Python
これでいいのかな?
1 2 3 4 5 6 7 8 9 | import csv
from StringIO import StringIO
# Sample record: six fields, one holding an embedded newline and one holding
# doubled quotes.
data = """"aaa","b
bb","ccc",zzz,"y""Y""y",xxx"""

# Let the csv module split the text, then print each field of each record
# together with its 1-based position.
for row in csv.reader(StringIO(data)):
    for i, v in enumerate(row):
        print("%d => %s" % (i + 1, v))
|
csvモジュールを使うのが常道だと思いますが、あえて自前で処理してみました。cStringIOは初めて使いましたが、undoみたいな処理をするには便利かも。(seek の前の if c: が必要なのに気づかず少しはまりましたが)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | import cStringIO
import os
def split_csv(csv):
    """Yield the fields of the first CSV record found in *csv*.

    Fields are separated by commas.  A field may be wrapped in double quotes,
    in which case it may contain commas and line breaks, and a doubled quote
    inside the quotes stands for one literal quote character.  Parsing stops
    at the first unquoted CR or LF (or at the end of the text), so any later
    records are ignored.

    Args:
        csv: the text to split.

    Yields:
        Each field as a string with quoting removed.  An empty input yields
        a single empty field.

    Raises:
        ValueError: when the text ends inside a quoted section.  The error
            is raised lazily, when the offending field is requested.
    """
    pos = 0
    size = len(csv)
    record_done = False
    while not record_done:
        chars = []
        quoted = False
        while True:
            if pos >= size:  # end of text also ends the record
                record_done = True
                break
            c = csv[pos]
            pos += 1
            if quoted:
                if c != '"':
                    chars.append(c)
                elif csv[pos:pos + 1] == '"':
                    # A doubled quote is an escape only while inside quotes.
                    chars.append('"')
                    pos += 1
                else:
                    quoted = False
            elif c == '"':
                # Opening quote; no lookahead, so that an empty quoted field
                # ("") is read as empty rather than as an escaped quote.
                quoted = True
            elif c == ',':
                break
            elif c in ('\r', '\n'):
                record_done = True  # ignore any following record
                break
            else:
                chars.append(c)
        if quoted:
            raise ValueError("unterminated quotation")
        yield "".join(chars)
def main():
    """Split the sample record and print each field with its 1-based index."""
    sample = """\
"aaa","b
bb","ccc",zzz,"y""Y""y",xxx
"""
    for index, field in enumerate(split_csv(sample)):
        print("%d => %s" % (index + 1, field))


if __name__ == '__main__':
    main()
|
お題に対応する最小限のチェックをしたつもりですが、入力文字列によっては見逃しがあるかも。
def splitCVS(s):
    """Split one CSV record into a list of field values.

    The text is first cut at every comma.  Pieces that belong to the same
    quoted field are then glued back together.  A quoted field has its
    surrounding quotes removed and each doubled quote turned into a single
    quote character.

    Args:
        s: the record text.

    Returns:
        A list of field strings.

    Raises:
        ValueError: when a quoted field is never closed, or when a field
            contains a stray quote or an unquoted line break.
    """
    fields = []
    pieces = iter(s.split(','))
    for piece in pieces:
        # An odd quote count means the comma we cut at was inside a quoted
        # field: join the next piece back on, restoring that comma.
        while piece.startswith('"') and piece.count('"') % 2:
            rest = next(pieces, None)
            if rest is None:
                raise ValueError('unterminated quoted field')
            piece += ',' + rest
        if piece.startswith('"') and piece.endswith('"'):
            piece = piece[1:-1].replace('""', '"')
        elif '"' in piece or '\n' in piece:
            raise ValueError('invalid field: %r' % piece)
        fields.append(piece)
    return fields
# Demo: split a sample record and print its fields numbered from 1.
l = splitCVS('"abc","b\nbb","cc,c",zzz,"y""Y""y",xxx')
for i, s in enumerate(l):
    print('%2d = %s' % (i + 1, s))
|
csvをつかっても面白くないので正規表現でやってみました。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 | import re
# A field token has one of three shapes, tried in this order.
# Quoted: a quote-delimited body made of doubled quotes and non-quote
# characters, so commas and line breaks are allowed inside.
quoted = r'("((""|[^"])+)")'
# Naked: a run of characters containing no comma, quote or newline.
naked = r'([^,"\n]+)'
# Enclosed: a naked run wrapped in quotes, captured as group "enclosed".
enclosed = r'("(?P<enclosed>%s)")' % naked
record = '|'.join([quoted, enclosed, naked])
r = re.compile(record)
def unescape(s):
    """Return the value of one matched field token.

    A token that starts with a double quote is treated as a quoted field:
    its first and last characters are dropped and every doubled quote in the
    body becomes a single quote character.  Any other token is returned
    unchanged, apart from the same doubled-quote replacement.

    Deleting every quote that is not followed by another quote (the earlier
    approach) removed only one quote from each run, so a body containing two
    escaped quotes in a row came out with three quote characters.
    """
    if s.startswith('"'):
        s = s[1:-1]
    return s.replace('""', '"')
def parse(s):
    """Print every field token found in *s*, unescaped and numbered from 1."""
    for number, match in enumerate(r.finditer(s)):
        print("%d => %s" % (number + 1, unescape(match.group(0))))


parse('''"aaa","b
bb","ccc",zzz,"y""Y""y",xxx''')
|
""""の処理がおかしかった。
unescapeが美しくない。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | '''
>>> parse('aaa')
1 => aaa
>>> parse('"aaa"')
1 => aaa
>>> parse('"a\\naa"')
1 => a\naa
>>> parse('"a""aa"')
1 => a"aa
>>> parse('"a""""aa"')
1 => a""aa
>>> parse('aaa,bbb')
1 => aaa
2 => bbb
>>> parse('aaa, bbb')
1 => aaa
2 => bbb
>>> parse('aaa,"b\\nbb"')
1 => aaa
2 => b\nbb
>>> parse('aaa,"b\\n""bb"')
1 => aaa
2 => b\n"bb
'''
import re
# Token shapes for one field, listed in the order the alternation tries them.
# Quoted: quote-delimited body of doubled quotes and non-quote characters.
quoted = r'("((""|[^"])+)")'
# Naked: one or more characters that are not a comma, quote or newline.
naked = r'([^,"\n]+)'
# Enclosed: a naked run between quotes, captured as group "enclosed".
enclosed = r'("(?P<enclosed>%s)")' % naked
record = '|'.join((quoted, enclosed, naked))
r = re.compile(record)
def unescape(s):
    """Return the value of field token *s*.

    A token starting with a double quote loses its first and last character;
    in every case each doubled quote is then replaced by a single one.
    """
    body = s[1:-1] if s.startswith('"') else s
    return body.replace('""', '"')
def parse(s):
    """Print every field token found in *s*, unescaped and numbered from 1."""
    for number, match in enumerate(r.finditer(s)):
        print("%d => %s" % (number + 1, unescape(match.group(0))))


# Demo run on a record with an embedded newline and doubled quotes.
parse('''"aaa","b
b""b","ccc",zzz,"y""Y""y",xxx''')

# Run the examples in the module docstring.
import doctest
doctest.testmod()
|
名前つきの正規表現を使ってみた。
d.get(...) or の連続と、recordの定義が重複しているのが美しくない。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 | '''
>>> parse('aaa')
1 => aaa
>>> parse('"aaa"')
1 => aaa
>>> parse('"a\\naa"')
1 => a\naa
>>> parse('"a""aa"')
1 => a"aa
>>> parse('"a""""aa"')
1 => a""aa
>>> parse('aaa,bbb')
1 => aaa
2 => bbb
>>> parse('aaa, bbb')
1 => aaa
2 => bbb
>>> parse('aaa,"b\\nbb"')
1 => aaa
2 => b\nbb
>>> parse('aaa,"b\\n""bb"')
1 => aaa
2 => b\n"bb
'''
import re
# Naked: one or more characters that are not a comma, quote or newline.
naked = r'[^,"\n]+'
# Quoted: quote-delimited body of doubled quotes and non-quote characters;
# the body is captured as group "quoted".
quoted = r'("(?P<quoted>(""|[^"])+)")'
# Enclosed: a naked run between quotes; the run is captured as "enclosed".
enclosed = r'("(?P<enclosed>%s)")' % naked
# The three shapes are tried in this order; a bare run is captured as "naked".
record = '|'.join([quoted, enclosed, '(?P<naked>%s)' % naked])
r = re.compile(record)
def unescape(s):
    """Return *s* with every doubled quote replaced by a single quote."""
    return s.replace('""', '"')
def parse(s):
    """Print the body of every field found in *s*, numbered from 1.

    The body is taken from whichever named group took part in the match
    (naked, enclosed or quoted) and is unescaped before printing.
    """
    for number, match in enumerate(r.finditer(s)):
        body = (match.group('naked')
                or match.group('enclosed')
                or match.group('quoted'))
        print("%d => %s" % (number + 1, unescape(body)))


# Demo run on a record with an embedded newline and doubled quotes.
parse('''"aaa","b
b""b","ccc",zzz,"y""Y""y",xxx''')

# Run the examples in the module docstring.
import doctest
doctest.testmod()
|






raynstard
#3389()
Rating1/1=1.00
[ reply ]