RFC 4180対応版 CSVレコードの分解
Posted feedbacks - OCaml
自前パースの戦略です。
OCamlではこの手のパースはストリームパーサを利用すると少しだけ楽かもしれません。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | (* strの中にcharはあるか? *)
(* [mem_string char str] tells whether the character [char] occurs
   anywhere in the string [str].

   Returns [true] when [char] is present and [false] otherwise (including
   when [str] is empty).  Delegates to the standard library instead of
   catching [Not_found] from [String.index] by hand. *)
let mem_string char str = String.contains str char
(* [next buf strm loop] moves one character from [strm] into [buf] and then
   hands the stream over to [loop].

   - If [strm] still has a character, that character is removed from the
     stream, appended to [buf], and the result of [loop strm] is returned.
   - If [strm] is exhausted, the accumulated contents of [buf] are returned
     when [buf] is non-empty; otherwise [Stream.Failure] is raised. *)
let next buf strm loop =
  match Stream.peek strm with
  | Some ch ->
    Stream.junk strm;
    Buffer.add_char buf ch;
    loop strm
  | None ->
    if Buffer.length buf = 0 then raise Stream.Failure
    else Buffer.contents buf
(* [until_sep ?buf strm] accumulates characters from [strm] into [buf] until
   the next character is a separator (comma, carriage return or line feed).

   The separator itself is left in the stream and the text gathered so far
   is returned.  When the stream runs out, the outcome is whatever [next]
   decides for an exhausted stream.  [buf] defaults to a fresh buffer and is
   threaded through the recursion. *)
let rec until_sep ?(buf = Buffer.create 80) strm =
  match Stream.peek strm with
  | Some (',' | '\r' | '\n') -> Buffer.contents buf
  | Some _ | None -> next buf strm (until_sep ~buf)
(* [until_quote ?buf strm] accumulates characters from [strm] into [buf]
   until it reaches a double-quote character that closes the field.

   Two consecutive double quotes are an escaped quote: both are removed from
   the stream and a single double quote is appended to [buf].  A lone double
   quote is left in the stream and the gathered text is returned.  Any other
   situation is delegated to [next].  [buf] defaults to a fresh buffer and
   is threaded through the recursion. *)
let rec until_quote ?(buf = Buffer.create 80) strm =
  let continue_with = until_quote ~buf in
  match Stream.peek strm with
  | Some '"' ->
    (* Look one character further only once a quote has been seen. *)
    (match Stream.npeek 2 strm with
     | [ '"'; '"' ] ->
       Stream.junk strm;
       Stream.junk strm;
       Buffer.add_char buf '"';
       continue_with strm
     | _ -> Buffer.contents buf)
  | _ -> next buf strm continue_with
(* Recognise a single field, which is either quoted or unquoted.

   Written with the stream-parser syntax extension (the parser keyword and
   stream patterns), so it needs the matching preprocessor to compile.

   - First branch: the next character is a double quote.  It is consumed,
     until_quote gathers the field text, and a closing double quote is then
     required and consumed.
   - Second branch: otherwise the field text is whatever until_sep gathers.

   The result is the field text as a string. *)
let parse_field = parser
[< 'fq when fq = '"'; field = until_quote; 'sq when sq = '"' >] ->
field
| [< field = until_sep >] ->
field
(* Program entry: cut the hard-coded sample record into fields and print
   each field on its own line, prefixed by its running number. *)
let _ =
(* print_field prints one field together with a counter that starts at 1
   and is incremented after every call. *)
let print_field =
let counter =
ref 1
in
fun str ->
Printf.printf "%d => %s\n" !counter str;
incr counter
in
(* parse reads one field with parse_field, prints it, and then looks at the
   rest of the stream: a comma is consumed and parsing continues with the
   next field; anything else ends the parse.  If no field can be read at
   all, the second outer branch ends the parse as well. *)
let rec parse = parser
[< field = parse_field; strm >] ->
print_field field;
begin match strm with parser
[< 'c when c = ','; rest >] ->
parse rest
| [< >] ->
()
end
| [< >] ->
()
in
(* Sample record: quoted fields, an embedded line feed, doubled quotes, and
   a trailing line feed. *)
parse (Stream.of_string "\"aaa\",\"b\nbb\",\"ccc\",zzz,\"y\"\"Y\"\"y\",xxx\n")
|
Genlex という OCaml に付属の簡易字句解析モジュールを使ってみました。
非常に手抜きな作りなので、サンプルは正しく出力できますが、きちんと CSV に対応はしていません。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 | open Genlex
(* [string_of_token token] renders a Genlex token as plain text.

   Identifiers and string literals yield their text, integers and floats
   are converted with [string_of_int] / [string_of_float], and a character
   becomes a one-character string.

   Raises [Failure] for a keyword token, which callers are expected to
   filter out beforehand. *)
let string_of_token = function
  | Genlex.Ident text | Genlex.String text -> text
  | Genlex.Int n -> string_of_int n
  | Genlex.Float x -> string_of_float x
  | Genlex.Char ch -> String.make 1 ch
  | Genlex.Kwd _ -> failwith "not use Kwd"
(* [columns_of_tokens str] splits [str] into columns using a Genlex lexer
   whose only keyword is the comma.

   Every comma token separates columns.  Consecutive non-comma tokens
   belong to the same column; their texts (via [string_of_token]) are
   joined with a double-quote character between them.  The columns are
   returned in input order. *)
let columns_of_tokens str =
  let tokens = Genlex.make_lexer [ "," ] (Stream.of_string str) in
  (* Consume the remaining tokens of the current column; [pieces] holds the
     texts seen so far, most recent first. *)
  let rec gather pieces =
    match Stream.peek tokens with
    | None | Some (Genlex.Kwd _) -> String.concat "\"" (List.rev pieces)
    | Some tok ->
      Stream.junk tokens;
      gather (string_of_token tok :: pieces)
  in
  (* Walk the token stream; [columns] is accumulated in reverse. *)
  let rec collect columns =
    match Stream.peek tokens with
    | None -> List.rev columns
    | Some (Genlex.Kwd _) ->
      Stream.junk tokens;
      collect columns
    | Some first ->
      Stream.junk tokens;
      let column = gather [ string_of_token first ] in
      collect (column :: columns)
  in
  collect []
(* [parse_and_print str] splits [str] with [columns_of_tokens] and prints
   every column on its own line as its 1-based number, an arrow, and the
   column text.  When there are no columns, only a newline is printed. *)
let parse_and_print str =
  match columns_of_tokens str with
  | [] -> print_newline ()
  | first :: rest ->
    Printf.printf "1 => %s\n" first;
    (* The remaining columns are numbered from 2 onwards. *)
    List.iteri
      (fun offset column -> Printf.printf "%d => %s\n" (offset + 2) column)
      rest
(* [main ()] parses and prints the record given as the sole command-line
   argument; with any other argument count it falls back to a built-in
   sample record. *)
let main () =
  let fallback = "\"aaa\",\"b\nbb\",\"ccc\",zzz,\"y\"\"Y\"\"y\",xxx" in
  let sample =
    if Array.length Sys.argv = 2 then Sys.argv.(1) else fallback
  in
  parse_and_print sample

(* Run [main] only when not loaded into the interactive toplevel. *)
let () = if !Sys.interactive then () else main ()
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | (* ocaml camlp4rf.cma もしくは
ocamlc -pp "camlp4rf" a.ml *)
(* add_opened_csv_string buf st reads the remainder of a quoted section from
   the character stream st into buf.  record_iteri calls it right after it
   has consumed an opening double quote.

   - When a double quote is read and the following character is not another
     double quote, the quoted section is over and unit is returned.
   - When a double quote is read and another double quote follows, the
     second one is consumed and appended to buf (an escaped quote), and
     reading continues.
   - Any other character is appended to buf and reading continues.

   NOTE(review): there is no branch for an exhausted stream, so an
   unterminated quoted section presumably ends in a stream parse failure;
   confirm against the Camlp4 stream parser semantics. *)
value add_opened_csv_string =
let rec loop buf = parser
[ [: `'"'; st:] ->
if (Stream.peek st <> Some '"') then ()
else (Buffer.add_char buf (Stream.next st); loop buf st)
| [: `c ; st :] -> (Buffer.add_char buf c; loop buf st) ]
in fun buf st -> loop buf st;
(* record_iteri st f buf reads one record from the character stream st and
   calls f pos field for each field, where pos starts at 1 and increases by
   one per field.  buf is the working buffer for the current field.

   - A comma ends the current field and starts the next one.
   - A line feed ends the current field and stops reading.
   - A double quote hands over to add_opened_csv_string, which appends the
     quoted text to the same field; reading then continues in that field.
   - Any other character is appended to the current field.
   - An exhausted stream ends the current field and stops reading. *)
value record_iteri =
(* Pass the buffered text to f, then empty the buffer for the next field. *)
let use_buffer f buf =
(f (Buffer.contents buf); Buffer.clear buf) in
let rec loop f pos buf = parser
[ [: `','; st:] -> (use_buffer (f pos) buf; loop f (pos+1) buf st)
| [: `'\n'; st:] -> use_buffer (f pos) buf
| [: `'"'; st:] -> (add_opened_csv_string buf st; loop f pos buf st)
| [: `c ; st:] -> (Buffer.add_char buf c; loop f pos buf st)
| [: :] -> use_buffer (f pos) buf ]
in fun st f buf -> loop f 1 buf st;
(*
value t = Stream.of_string "\
\"aaa\",\"b\n\
bb\",\"ccc\",zzz,\"y\"\"Y\"\"y\",xxx";
record_iteri t (Printf.printf "%2d => %s\n") (Buffer.create 8);
*)
|


raynstard
#3389()
Rating1/1=1.00
[ reply ]