challenge RFC 4180対応版 CSVレコードの分解

ある関数(splitCSV)に渡された文字列を配列に分解して列ごとに表示してください。
渡される文字列は、CSVデータの1レコードが設定されているとします。

使用するデータはK3形式が元になっている仕様で
エクセルが出力しているような形式です。

書式には次のような特徴があります。
1. 各レコードは「改行」によって区切られている。
2. 各列は「,」によって区切られている。
3. 列のデータは「"」によって囲んでも良い。
4. 列に「,」「改行」「"」いずれかを含む場合「"」で
   囲わなければならない。
5. 列データに「"」を含める場合「""」とする。

本来、改行コードはCRLFですが今回は特に指定しません。

次の入力があった場合
"aaa","b
bb","ccc",zzz,"y""Y""y",xxx

出力は
1 => aaa
2 => b
bb
3 => ccc
4 => zzz
5 => y"Y"y
6 => xxx

となります。
このお題はraynstardさんの投稿によるものです。ご投稿ありがとうございます。助かります。

Posted feedbacks - Haskell

まずは愚直にparseしてみた。 mapM putStrLn のあたりが気持ち悪い? showField中で出力しちゃう方がいいんだろうか。
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
import List

parseF [] r        = reverse r
parseF ('\"':cs) r = parseE cs [] r
parseF cs r        = parseN cs [] r

parseE [] f r             = parseF [] $ reverse f:r
parseE ('\"':',':cs) f r  = parseF cs $ reverse f:r
parseE ('\"':'\n':cs) f r = parseF cs $ reverse f:r
parseE ('\"':'\"':cs) f r = parseE cs ('\"':f) r
parseE ('\"':c:cs) f r    = parseE ('\"':cs) f r -- should be an error?
parseE (c:cs) f r         = parseE cs (c:f) r

parseN [] f r        = parseF [] $ reverse f:r
parseN ('\n':cs) f r = parseF [] $ reverse f:r
parseN (',':cs) f r  = parseF cs $ reverse f:r
parseN (c:cs) f r    = parseN cs (c:f) r

splitCVS record = mapM putStrLn $ snd $ mapAccumL showField 1 $ parseF record []
  where showField i f = (i+1, (show i)++" => "++f)

main = do splitCVS "\"aaa\",\"b\nbb\",\"ccc\",zzz,\"y\"\"Y\"\"y\",xxx\n"; return ()

つづいてParsecに挑戦
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
import Text.ParserCombinators.Parsec

record = do fields <- sepBy field (char ',')
            char '\n'
            return fields

field = between (char '\"') (char '\"') quotedField
    <|> many (noneOf ",\n")

quotedField = many $ (try $ do string "\"\""; return '\"') <|> noneOf "\""

splitCVS line = case (runParser record () "" line) of
                 Left err     -> print err
                 Right fields -> do mapM putStrLn fields; return ()

main = splitCVS "\"aaa\",\"b\nbb\",\"ccc\",zzz,\"y\"\"Y\"\"y\",xxx\n"

Parsec の出番!UnitTest 付きで。
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import Text.ParserCombinators.Parsec
import Test.HUnit

line :: Parser [String]
line = do columns <- sepBy1 column comma
          optional (char '\n')
          return columns

column :: Parser String
column = do b <- char '"'
            c <- many ((satisfy (/= '"')) <|> try escapedQuote)
            d <- char '"'
            return c
         <|>
         many1 (noneOf [',', '"', '\n'])
         <?> "escapedQuote"

escapedQuote :: Parser Char
escapedQuote = do string "\"\"" <?> "escapedQuote"
                  return '"'

comma :: Parser ()
comma = skipMany1 (char ',' <?> "comma")

splitCSV :: String -> [String]
splitCSV s = case (parse line "" s) of
             Left err -> error ("parse error at " ++ (show err))
             Right x -> x

testData :: [Test]
testData = [
            ["abc", "def"] ~=? splitCSV "abc,def\n",
            ["abc", "def"] ~=? splitCSV "\"abc\",def",
            ["a,bc", "def"] ~=? splitCSV "\"a,bc\",\"def\"\n",
            ["abc", "b\nbb", "ccc", "zzz", "y\"Y\"y", "xxx"] ~=? splitCSV "\"abc\",\"b\nbb\",\"ccc\",zzz,\"y\"\"Y\"\"y\",xxx\n"
           ]

main :: IO Counts
main = runTestTT (test testData)

あ、カラム番号とともに出力するところまでが課題なのね。
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import Text.ParserCombinators.Parsec
import Test.HUnit

line :: Parser [String]
line = do columns <- sepBy1 column comma
          optional (char '\n')
          return columns

column :: Parser String
column = do b <- char '"'
            c <- many ((satisfy (/= '"')) <|> try escapedQuote)
            d <- char '"'
            return c
         <|>
         many1 (noneOf [',', '"', '\n'])
         <?> "escapedQuote"

escapedQuote :: Parser Char
escapedQuote = do string "\"\"" <?> "escapedQuote"
                  return '"'

comma :: Parser ()
comma = skipMany1 (char ',' <?> "comma")

splitCSV :: String -> [String]
splitCSV s = case (parse line "" s) of
             Left err -> error ("parse error at " ++ (show err))
             Right x -> x

testData :: [Test]
testData = [
            ["abc", "def"] ~=? splitCSV "abc,def\n",
            ["abc", "def"] ~=? splitCSV "\"abc\",def",
            ["a,bc", "def"] ~=? splitCSV "\"a,bc\",\"def\"\n",
            ["abc", "b\nbb", "ccc", "zzz", "y\"Y\"y", "xxx"] ~=? splitCSV "\"abc\",\"b\nbb\",\"ccc\",zzz,\"y\"\"Y\"\"y\",xxx\n"
           ]

main :: IO ()
main = do cs <- getContents
          mapM_ output (zip [1..] $ splitCSV cs)
  where output (n, col) = putStrLn $ (show n) ++ " => " ++ col

mapAccumL を使ってみました。
状態遷移表という感じ。
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import List
import Maybe

data State = F | E | G | Z

parse (F,xs) '\"' = ((E,xs),       Nothing)
parse (F,xs) '\n' = ((Z,[]),       Just xs)
parse (F,xs) ','  = ((F,[]),       Just xs)
parse (F,xs) c    = ((F,xs++[c]),  Nothing)

parse (E,xs) '\"' = ((G,xs),       Nothing)
parse (E,xs) c    = ((E,xs++[c]),  Nothing)

parse (G,xs) ','  = ((F,[]),       Just xs)
parse (G,xs) '\n' = ((F,[]),       Just xs)
parse (G,xs) '\"' = ((E,xs++"\""), Nothing)
parse (G,xs) c    = ((F,xs++[c]),  Nothing)

parse (Z,_)  c    = ((Z,[]),       Nothing)

splitCVS record = mapM_ putStrLn $ zipWith showFiled [1..]
  $ case s of
      (F, xs) -> fs ++ [xs]
      (E, xs) -> fs ++ [xs]
      _ -> fs
  where
    showFiled i f = (show i)++" => "++f
    (s, xss) = mapAccumL parse (F,[]) record
    fs = catMaybes xss

入力文字列を Maybe列(Nothing が EOF的役割)に変換してから
処理させてコードをすっきりさせました。

さらに Maybe を組み合わせて '\n' を認識したところで処理を
打ち切るようにさせたいのですが Maybe の嵐になるのでこのへんで
やめておきます。
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import List
import Maybe

data State = F | E | G | Z

parse (F,xs) (Just '\"') = ((E,xs),       Nothing)
parse (F,xs) (Just '\n') = ((Z,[]),       Just xs)
parse (F,xs) (Just ',')  = ((F,[]),       Just xs)
parse (F,xs) (Just c)    = ((F,xs++[c]),  Nothing)
parse (F,xs) Nothing     = ((Z,[]),       Just xs)

parse (E,xs) (Just '\"') = ((G,xs),       Nothing)
parse (E,xs) (Just c)    = ((E,xs++[c]),  Nothing)
parse (E,xs) Nothing     = ((Z,[]),       Just xs)

parse (G,xs) (Just ',')  = ((F,[]),       Just xs)
parse (G,xs) (Just '\n') = ((F,[]),       Just xs)
parse (G,xs) (Just '\"') = ((E,xs++"\""), Nothing)
parse (G,xs) (Just c)    = ((F,xs++[c]),  Nothing)
parse (G,xs) Nothing     = ((Z,[]),       Just xs)

parse (Z,_)  _           = ((Z,[]),       Nothing)

splitCVS record = mapM_ putStrLn $ zipWith showFiled [1..]
    $ catMaybes $ snd
    $ mapAccumL parse (F,[]) $ map Just record ++ [Nothing]
  where
    showFiled i f = (show i)++" => "++f

手抜き版。相互再帰lexer
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
main = putStr 
     . unlines
     . map (uncurry $ flip ((.) . (++) . show) (" => "++))
     . zip [1..]
     . splitCSV =<< getContents

splitCSV = lex0 [] ""

lex0 cs c "" = reverse (c:cs)
lex0 cs c ('"' :xs) = lex1 cs c xs
lex0 cs c (',' :xs) = lex0 (reverse c:cs) "" xs
lex0 cs c (x   :xs) = lex0 cs (x:c) xs
lex1 cs c ('"':',':xs) = lex0 (reverse c:cs) "" xs
lex1 cs c ('"':'"':xs) = lex1 cs ('"':c) xs
lex1 cs c (x      :xs) = lex1 cs (x  :c) xs

Index

Feed

Other

Link

Pathtraq

loading...