RFC 4180対応版 CSVレコードの分解
Posted feedbacks - Haskell
まずは愚直にparseしてみた。
mapM putStrLn のあたりが気持ち悪い?
showField中で出力しちゃう方がいいんだろうか。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | import List
parseF [] r = reverse r
parseF ('\"':cs) r = parseE cs [] r
parseF cs r = parseN cs [] r
parseE [] f r = parseF [] $ reverse f:r
parseE ('\"':',':cs) f r = parseF cs $ reverse f:r
parseE ('\"':'\n':cs) f r = parseF cs $ reverse f:r
parseE ('\"':'\"':cs) f r = parseE cs ('\"':f) r
parseE ('\"':c:cs) f r = parseE ('\"':cs) f r -- should be an error?
parseE (c:cs) f r = parseE cs (c:f) r
parseN [] f r = parseF [] $ reverse f:r
parseN ('\n':cs) f r = parseF [] $ reverse f:r
parseN (',':cs) f r = parseF cs $ reverse f:r
parseN (c:cs) f r = parseN cs (c:f) r
splitCVS record = mapM putStrLn $ snd $ mapAccumL showField 1 $ parseF record []
where showField i f = (i+1, (show i)++" => "++f)
main = do splitCVS "\"aaa\",\"b\nbb\",\"ccc\",zzz,\"y\"\"Y\"\"y\",xxx\n"; return ()
|
つづいてParsecに挑戦
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | import Text.ParserCombinators.Parsec
record = do fields <- sepBy field (char ',')
char '\n'
return fields
field = between (char '\"') (char '\"') quotedField
<|> many (noneOf ",\n")
quotedField = many $ (try $ do string "\"\""; return '\"') <|> noneOf "\""
splitCVS line = case (runParser record () "" line) of
Left err -> print err
Right fields -> do mapM putStrLn fields; return ()
main = splitCVS "\"aaa\",\"b\nbb\",\"ccc\",zzz,\"y\"\"Y\"\"y\",xxx\n"
|
Parsec の出番!UnitTest 付きで。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | import Text.ParserCombinators.Parsec
import Test.HUnit
line :: Parser [String]
line = do columns <- sepBy1 column comma
optional (char '\n')
return columns
column :: Parser String
column = do b <- char '"'
c <- many ((satisfy (/= '"')) <|> try escapedQuote)
d <- char '"'
return c
<|>
many1 (noneOf [',', '"', '\n'])
<?> "escapedQuote"
escapedQuote :: Parser Char
escapedQuote = do string "\"\"" <?> "escapedQuote"
return '"'
comma :: Parser ()
comma = skipMany1 (char ',' <?> "comma")
splitCSV :: String -> [String]
splitCSV s = case (parse line "" s) of
Left err -> error ("parse error at " ++ (show err))
Right x -> x
testData :: [Test]
testData = [
["abc", "def"] ~=? splitCSV "abc,def\n",
["abc", "def"] ~=? splitCSV "\"abc\",def",
["a,bc", "def"] ~=? splitCSV "\"a,bc\",\"def\"\n",
["abc", "b\nbb", "ccc", "zzz", "y\"Y\"y", "xxx"] ~=? splitCSV "\"abc\",\"b\nbb\",\"ccc\",zzz,\"y\"\"Y\"\"y\",xxx\n"
]
main :: IO Counts
main = runTestTT (test testData)
|
あ、カラム番号とともに出力するところまでが課題なのね。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | import Text.ParserCombinators.Parsec
import Test.HUnit
line :: Parser [String]
line = do columns <- sepBy1 column comma
optional (char '\n')
return columns
column :: Parser String
column = do b <- char '"'
c <- many ((satisfy (/= '"')) <|> try escapedQuote)
d <- char '"'
return c
<|>
many1 (noneOf [',', '"', '\n'])
<?> "escapedQuote"
escapedQuote :: Parser Char
escapedQuote = do string "\"\"" <?> "escapedQuote"
return '"'
comma :: Parser ()
comma = skipMany1 (char ',' <?> "comma")
splitCSV :: String -> [String]
splitCSV s = case (parse line "" s) of
Left err -> error ("parse error at " ++ (show err))
Right x -> x
testData :: [Test]
testData = [
["abc", "def"] ~=? splitCSV "abc,def\n",
["abc", "def"] ~=? splitCSV "\"abc\",def",
["a,bc", "def"] ~=? splitCSV "\"a,bc\",\"def\"\n",
["abc", "b\nbb", "ccc", "zzz", "y\"Y\"y", "xxx"] ~=? splitCSV "\"abc\",\"b\nbb\",\"ccc\",zzz,\"y\"\"Y\"\"y\",xxx\n"
]
main :: IO ()
main = do cs <- getContents
mapM_ output (zip [1..] $ splitCSV cs)
where output (n, col) = putStrLn $ (show n) ++ " => " ++ col
|
mapAccumL を使ってみました。 状態遷移表という感じ。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | import List
import Maybe
data State = F | E | G | Z
parse (F,xs) '\"' = ((E,xs), Nothing)
parse (F,xs) '\n' = ((Z,[]), Just xs)
parse (F,xs) ',' = ((F,[]), Just xs)
parse (F,xs) c = ((F,xs++[c]), Nothing)
parse (E,xs) '\"' = ((G,xs), Nothing)
parse (E,xs) c = ((E,xs++[c]), Nothing)
parse (G,xs) ',' = ((F,[]), Just xs)
parse (G,xs) '\n' = ((F,[]), Just xs)
parse (G,xs) '\"' = ((E,xs++"\""), Nothing)
parse (G,xs) c = ((F,xs++[c]), Nothing)
parse (Z,_) c = ((Z,[]), Nothing)
splitCVS record = mapM_ putStrLn $ zipWith showFiled [1..]
$ case s of
(F, xs) -> fs ++ [xs]
(E, xs) -> fs ++ [xs]
_ -> fs
where
showFiled i f = (show i)++" => "++f
(s, xss) = mapAccumL parse (F,[]) record
fs = catMaybes xss
|
入力文字列を Maybe列(Nothing が EOF的役割)に変換してから 処理させてコードをすっきりさせました。 さらに Maybe を組み合わせて '\n' を認識したところで処理を 打ち切るようにさせたいのですが Maybe の嵐になるのでこのへんで やめておきます。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | import List
import Maybe
data State = F | E | G | Z
parse (F,xs) (Just '\"') = ((E,xs), Nothing)
parse (F,xs) (Just '\n') = ((Z,[]), Just xs)
parse (F,xs) (Just ',') = ((F,[]), Just xs)
parse (F,xs) (Just c) = ((F,xs++[c]), Nothing)
parse (F,xs) Nothing = ((Z,[]), Just xs)
parse (E,xs) (Just '\"') = ((G,xs), Nothing)
parse (E,xs) (Just c) = ((E,xs++[c]), Nothing)
parse (E,xs) Nothing = ((Z,[]), Just xs)
parse (G,xs) (Just ',') = ((F,[]), Just xs)
parse (G,xs) (Just '\n') = ((F,[]), Just xs)
parse (G,xs) (Just '\"') = ((E,xs++"\""), Nothing)
parse (G,xs) (Just c) = ((F,xs++[c]), Nothing)
parse (G,xs) Nothing = ((Z,[]), Just xs)
parse (Z,_) _ = ((Z,[]), Nothing)
splitCVS record = mapM_ putStrLn $ zipWith showFiled [1..]
$ catMaybes $ snd
$ mapAccumL parse (F,[]) $ map Just record ++ [Nothing]
where
showFiled i f = (show i)++" => "++f
|
手抜き版。相互再帰lexer
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | main = putStr
. unlines
. map (uncurry $ flip ((.) . (++) . show) (" => "++))
. zip [1..]
. splitCSV =<< getContents
splitCSV = lex0 [] ""
lex0 cs c "" = reverse (c:cs)
lex0 cs c ('"' :xs) = lex1 cs c xs
lex0 cs c (',' :xs) = lex0 (reverse c:cs) "" xs
lex0 cs c (x :xs) = lex0 cs (x:c) xs
lex1 cs c ('"':',':xs) = lex0 (reverse c:cs) "" xs
lex1 cs c ('"':'"':xs) = lex1 cs ('"':c) xs
lex1 cs c (x :xs) = lex1 cs (x :c) xs
|





raynstard
#3389()
Rating1/1=1.00
[ reply ]