一部のHTMLタグを通すフィルタ
Posted feedbacks - OCaml
ocamllex で。 こんなに長くなるとは思わなかった。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 | {
(* Attributes of one tag, as (name, value) pairs. *)
type attributes = (string * string) list (* name, value *)
(* One fragment produced by the lexer's [main] entry point. *)
type input =
(* Plain text that is not part of a recognised tag. *)
| String of string
(* Tag closed with "/>": name, attributes, offset of the opening '<',
   offset just past the closing "/>". *)
| SingleTag of string * attributes * int * int
(* Tag closed with ">": name, attributes, offset of the opening '<',
   offset just past the closing '>'. *)
| OpenTag of string * attributes * int * int
(* Closing tag: name, offset of the opening "</", and a second offset
   computed by the [close_tag] rule. *)
| CloseTag of string * int * int
(* End of input. *)
| Eof
}
(* Delimiters that open and close tags. *)
let tag_start = "<"
let close_tag_start = "</"
let tag_end = ">"
let single_tag_end = "/>"
(* Characters accepted in a tag name: ASCII letters only. *)
let tag_constituent = ['a' - 'z' 'A' - 'Z']
(* Characters accepted in an attribute name: ASCII letters and '-'. *)
let attr_constituent = ['a' - 'z' 'A' - 'Z' '-']
(* Whitespace: space, tab, line feed, carriage return. *)
let white = [' ' '\t' '\n' '\r']
(* Characters accepted in an attribute value written without quotes. *)
let quote_omittable = ['a' - 'z' 'A' - 'Z' '0' - '9' '.' '-']
(* Entry point: returns the next fragment of the input.
   The offset of the '<' (or "</") is passed on to the tag rules so they
   can report where the tag starts in the source string. *)
rule main = parse
(* '<' : hand over to [tag] with the offset of the '<'. *)
| tag_start { tag (Lexing.lexeme_start lexbuf) lexbuf }
(* "</" : hand over to [close_tag] with the offset of the "</". *)
| close_tag_start { close_tag (Lexing.lexeme_start lexbuf) lexbuf }
(* Any run of characters other than '<' is plain text. *)
| [^'<']+ as s { String s }
| eof { Eof }
(* [tag p] lexes what follows a '<' that was found at offset [p].
   - A run of letters is taken as the tag name and lexing continues in
     [attr_list].
   - A run of non-letters means this was not a tag: the '<' and that run
     are returned as plain text.
   - At end of input the lone '<' is returned as plain text; without this
     rule an input ending in '<' made the lexer raise
     [Failure "lexing: empty token"]. *)
and tag p = parse
  | tag_constituent+ as name { attr_list p name [] lexbuf }
  | (_#tag_constituent)+ as s { String ("<"^s) }
  | eof { String "<" }
(* [attr_list p tagname attrs] lexes the inside of a tag named [tagname]
   whose '<' is at offset [p]. [attrs] accumulates the attributes seen so
   far in reverse order; it is reversed when the tag is returned.
   Returns [OpenTag] on '>' and [SingleTag] on "/>", both carrying [p] and
   the offset just past the terminator. *)
and attr_list p tagname attrs = parse
  | white+ { attr_list p tagname attrs lexbuf }
  | (attr_constituent+ as name) white* '=' {
      let value = attr_value lexbuf in
      attr_list p tagname ((name, value)::attrs) lexbuf
    }
  | tag_end {
      let endpos = Lexing.lexeme_end lexbuf in
      OpenTag(tagname, List.rev attrs, p, endpos)
    }
  | single_tag_end {
      let endpos = Lexing.lexeme_end lexbuf in
      SingleTag(tagname, List.rev attrs, p, endpos)
    }
  | eof {
      (* Unterminated tag: treat the end of input as the closing '>'.
         Without this rule an input such as "<a href=x" made the lexer
         raise [Failure "lexing: empty token"]. *)
      let endpos = Lexing.lexeme_end lexbuf in
      OpenTag(tagname, List.rev attrs, p, endpos)
    }
  (* Anything else (e.g. an attribute name without '=') is skipped one
     character at a time. *)
  | _ { attr_list p tagname attrs lexbuf }
(* Lexes the value that follows an attribute's '=' and returns it without
   its quotes. *)
and attr_value = parse
(* Whitespace before the value is skipped. *)
| white+ { attr_value lexbuf }
(* Single-quoted value. *)
| '\'' ([^'\'']* as value) '\'' { value }
(* Double-quoted value. *)
| '"' ([^'"']* as value) '"' { value }
(* Unquoted value: a possibly empty run of [quote_omittable] characters.
   Because ocamllex prefers the longest match, the empty match is only
   selected when no other rule can consume a character (e.g. at end of
   input). *)
| (quote_omittable* as value) { value }
(* Any other single character is skipped and lexing of the value
   continues. *)
| _ { attr_value lexbuf }
(* [close_tag p] lexes what follows a "</" that was found at offset [p].
   A well-formed closing tag yields [CloseTag (name, p, q)] where [q] is
   the offset just past the closing '>', so that the source slice [p, q)
   covers the whole tag. The original code used [Lexing.lexeme_start]
   here, which is the position right after "</": a rejected closing tag
   was then echoed as "</" only and its name and '>' were lost. *)
and close_tag p = parse
  | (tag_constituent+ as name) [^'>']* tag_end {
      CloseTag (name, p, Lexing.lexeme_end lexbuf)
    }
  (* A name that is never followed by '>' is not a tag: return it as
     plain text instead of failing with "lexing: empty token". *)
  | tag_constituent+ as text {
      String ("</"^text)
    }
  | (_#tag_constituent)+ as text {
      String ("</"^text)
    }
  (* "</" at the very end of the input. *)
  | eof {
      String "</"
    }
{
(* [parse_input s] runs the [main] lexer over [s] and returns every
   fragment in source order. The terminating [Eof] is not included. *)
let parse_input s =
  let lexbuf = Lexing.from_string s in
  let rec collect acc =
    match main lexbuf with
    | (String _ | SingleTag _ | OpenTag _ | CloseTag _) as fragment ->
      collect (fragment :: acc)
    | Eof -> List.rev acc
  in
  collect []
(* [s1 === s2] is case-insensitive string equality: both operands are
   upper-cased (ASCII letters only) before being compared.
   [String.uppercase] is deprecated and no longer exists in OCaml 5, so
   [String.uppercase_ascii] is used instead. *)
let (===) s1 s2 = String.uppercase_ascii s1 = String.uppercase_ascii s2
(* [is_allowed_tag tagname] is true when [tagname] matches, ignoring
   case, one of the tags that are passed through: a, br, strong. *)
let is_allowed_tag tagname =
  let whitelist = ["a"; "br"; "strong"] in
  List.exists (( === ) tagname) whitelist
(* [is_allowed_attribute tagname attrname] is true only for the "href"
   and "name" attributes of an "a" tag (names compared ignoring case). *)
let is_allowed_attribute tagname attrname =
  (* This got tedious, so it is ad hoc. *)
  if tagname === "a" then attrname === "href" || attrname === "name"
  else false
(* [sanitizing_output buf string p q] appends the characters of [string]
   at indices [p] (inclusive) to [q] (exclusive) to [buf], replacing the
   HTML-significant characters '<', '>' and '&' with the entities
   "&lt;", "&gt;" and "&amp;".
   The replacement strings used to be the bare characters themselves, so
   nothing was actually escaped. *)
let sanitizing_output buf string p q =
  for x = p to q - 1 do
    match string.[x] with
    | '<' -> Buffer.add_string buf "&lt;"
    | '>' -> Buffer.add_string buf "&gt;"
    | '&' -> Buffer.add_string buf "&amp;"
    | c -> Buffer.add_char buf c
  done
(* [output_attribute_if_allowed buf tagname (attrname, value)] appends
   [ attrname="value"] to [buf] when [is_allowed_attribute] accepts the
   attribute for [tagname]; otherwise it appends nothing.
   The value is wrapped in double quotes, or in single quotes when it
   contains a double quote. '<' and '>' inside the value are written as
   "&lt;" and "&gt;" so that markup hidden in an attribute value cannot
   reach the output verbatim. '&' is left alone so that entities already
   present in the source value are not escaped a second time. *)
let output_attribute_if_allowed buf tagname (attrname, value) =
  if is_allowed_attribute tagname attrname then begin
    let quote = if String.contains value '"' then '\'' else '"' in
    Printf.bprintf buf " %s=%c" attrname quote;
    String.iter
      (function
        | '<' -> Buffer.add_string buf "&lt;"
        | '>' -> Buffer.add_string buf "&gt;"
        | c -> Buffer.add_char buf c)
      value;
    Buffer.add_char buf quote
    (* NOTE(review): the URL scheme of href is not checked here, so a
       value such as "javascript:..." is still passed through. *)
  end
(* [output_tag buf name attrs is_single] appends an opening tag for
   [name] to [buf]: '<', the name, every attribute accepted by
   [output_attribute_if_allowed], then "/>" when [is_single] is true and
   ">" otherwise. *)
let output_tag buf name attrs is_single =
  Buffer.add_char buf '<';
  Buffer.add_string buf name;
  List.iter (fun attr -> output_attribute_if_allowed buf name attr) attrs;
  Buffer.add_string buf (if is_single then "/>" else ">")
(* [output_text_fragment buf src fragment] appends one lexed fragment to
   [buf]. [src] is the string the fragment was lexed from.
   - Plain text goes through [sanitizing_output].
   - A tag accepted by [is_allowed_tag] is re-emitted from its parsed
     name (and attributes).
   - Any other tag is written as the slice of [src] between the two
     offsets stored in the fragment, passed through [sanitizing_output].
   - [Eof] produces nothing. *)
let output_text_fragment buf src fragment =
  (* Run [emit ()] for an accepted tag, otherwise fall back to the source
     slice [p, q) the tag was lexed from. *)
  let tag_or_source tagname p q emit =
    if is_allowed_tag tagname then emit ()
    else sanitizing_output buf src p q
  in
  match fragment with
  | Eof -> ()
  | String text -> sanitizing_output buf text 0 (String.length text)
  | OpenTag (tagname, attrs, p, q) ->
    tag_or_source tagname p q (fun () -> output_tag buf tagname attrs false)
  | SingleTag (tagname, attrs, p, q) ->
    tag_or_source tagname p q (fun () -> output_tag buf tagname attrs true)
  | CloseTag (tagname, p, q) ->
    tag_or_source tagname p q (fun () -> Printf.bprintf buf "</%s>" tagname)
(* [filter_text text] lexes [text] into fragments, writes each of them
   through [output_text_fragment] and returns the accumulated result. *)
let filter_text text =
  let fragments = parse_input text in
  let out = Buffer.create (String.length text) in
  let emit fragment = output_text_fragment out text fragment in
  List.iter emit fragments;
  Buffer.contents out
}
|


にしお
#3410()
Rating0/0=0.00
このお題はperezvonさんの提案を元にしています。ありがとうございました。 ただ、いきなりだと難しいかと思ったので、肝の部分以外を先に出題しました。このお題は続編で徐々に難しくなっていきます。
追記:属性に<や>が含まれてしまうケースに漏れのある解答が多いようなのでテストケースを追加します。 これは「この出力なら十分」という意味です。この出力の通りでなければいけないという意味ではありません。 <script foo="<script>alert('bar')</script>">alert('foo')</script> → &lt;script foo="&lt;script&gt;alert('bar')&lt;/script&gt;"&gt;alert('foo')&lt;/script&gt; <script foo="<a href='link'>link</a>">alert('foo')</script> → &lt;script foo="&lt;a href='link'&gt;link&lt;/a&gt;"&gt;alert('foo')&lt;/script&gt; <a href='www.g>oogle.com'>link</a> → <a href="./www.g%3Eoogle.com">link</a>[ reply ]