改行をBRタグに置き換える
Posted feedbacks - Nested
Flatten Hiddenテストケースがあるといいですねぇ。 Tag Soup ライブラリをつかってます #2757にすこし手を入れたものです
see: Tag Soup
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 | module Main (main) where
import Data.Char
import Data.List
import Data.Maybe
import Text.HTML.TagSoup
-- Parsing
-- Text.HTML.TagSoup.parseTags :: String -> [Tag Char]
-- Translating
-- | Rewrite a whole parsed document: every tag is expanded through 'trans'
-- and the per-tag results are concatenated in their original order.
translate :: [Tag Char] -> [Tag Char]
translate tags = concat [trans t | t <- tags]
-- | Translate one parsed tag into the tag(s) that replace it.
--
-- * Opening tags for which 'ignore' holds become escaped text; the others
--   keep only the attributes accepted by 'filterAttr'.
-- * Closing tags for which 'ignore' holds become escaped text; the others
--   fall through to the catch-all and are kept unchanged.
-- * Text is split at every newline and the pieces are joined with a @br@
--   open/close pair, so each newline yields exactly one line break.
-- * Anything else is passed through unchanged.
trans :: Tag Char -> [Tag Char]
trans tag = case tag of
  TagOpen t attrs
    | ignore t  -> [escapeTagOpen t attrs]
    | otherwise -> [TagOpen t (filterAttr t attrs)]
  TagClose t
    | ignore t  -> [escapeTagClose t]
  TagText "" -> []
  TagText s  -> intercalate lineBreak (map ((: []) . TagText) (splitOnNewline s))
  _ -> [tag]
  where
    lineBreak = [TagOpen "br" [], TagClose "br"]
    -- Unlike 'lines', this keeps the empty piece after a trailing newline,
    -- so "a\n" produces a line break instead of silently losing it.
    splitOnNewline str = case break (== '\n') str of
      (piece, [])       -> [piece]
      (piece, _ : rest) -> piece : splitOnNewline rest
-- | 'True' when the tag name, compared case-insensitively, has no entry in
-- 'filterTable'.
ignore :: String -> Bool
ignore name = all ((/= lowered) . fst) filterTable
  where
    lowered = map toLower name
-- | Render a rejected opening tag as inert text.
--
-- The surrounding angle brackets are emitted as the entities @&lt;@ and
-- @&gt;@, and the tag name plus its attributes go through 'escape', so the
-- result is displayed literally rather than interpreted as markup.
-- No separating space is emitted when the tag has no attributes.
escapeTagOpen :: String -> [Attribute Char] -> Tag Char
escapeTagOpen t attrs = TagText $ "&lt;" ++ escape (t ++ attrText) ++ "&gt;"
  where
    attrText
      | null attrs = ""
      | otherwise  = ' ' : unwords (map showAttr attrs)
-- | Render a rejected closing tag as inert text, with the angle brackets
-- written as the entities @&lt;@ and @&gt;@.
escapeTagClose :: String -> Tag Char
escapeTagClose t = TagText $ "&lt;/" ++ t ++ "&gt;"
-- | Keep only the attributes that 'filterTable' allows for the given tag.
--
-- Tag and attribute names are lower-cased before the lookup so that the
-- decision agrees with 'ignore', which is also case-insensitive; otherwise
-- @\<A onclick=...\>@ would pass 'ignore' but miss its table entry and keep
-- every attribute. A tag without a table entry keeps no attributes.
filterAttr :: String -> [Attribute Char] -> [Attribute Char]
filterAttr t attrs = case lookup (map toLower t) filterTable of
  Just allowed -> filter (allowed . map toLower . fst) attrs
  Nothing      -> []
-- | Allowed tags, each paired with a predicate that says which attribute
-- names may be kept on that tag.
filterTable :: [(String, String -> Bool)]
filterTable =
  [ ("a", (`elem` ["href", "name"]))
  , ("strong", const False)
  , ("br", const False)
  ]
-- Showing
-- | Serialise a tag stream.
--
-- An opening tag for which 'isEmptyTag' holds is written in self-closing
-- form (@\<br/\>@); a directly following closing tag of the same name is
-- swallowed. Every other tag is rendered by 'showTag'.
showTags :: [Tag Char] -> String
showTags [] = ""
showTags (TagOpen s attrs : rest)
  | isEmptyTag s = angle (s ++ attrText ++ "/") ++ showTags (dropMatchingClose rest)
  where
    attrText
      | null attrs = ""
      | otherwise  = ' ' : unwords (map showAttr attrs)
    dropMatchingClose (TagClose e : more) | s == e = more
    dropMatchingClose other = other
showTags (t : ts) = showTag t ++ showTags ts
-- | Render a single tag as markup.
--
-- Closing tags are written as @\</name\>@ (the slash goes before the name),
-- and an opening tag without attributes is written without a trailing space.
-- Text is emitted as-is. Warnings and any other constructor render as the
-- empty string.
showTag :: Tag Char -> String
showTag tag = case tag of
  TagOpen t []    -> angle t
  TagOpen t attrs -> angle $ t ++ ' ' : unwords (map showAttr attrs)
  TagClose t      -> angle $ '/' : t
  TagText s       -> s
  TagComment c    -> angle $ "!--" ++ c ++ "--"
  TagSpecial s t  -> angle $ "!" ++ s ++ ' ' : t
  _               -> ""
-- | Wrap a string in a pair of angle brackets.
angle :: String -> String
angle s = concat ["<", s, ">"]
-- | 'True' for tag names that are written in self-closing form.
-- The comparison is case-insensitive so that @BR@ is treated like @br@.
isEmptyTag :: String -> Bool
isEmptyTag name = map toLower name `elem` ["br", "hr"] -- not full fledged
-- | Render one attribute as @name=value@ with a quoted value.
--
-- Single quotes are used unless the value contains one, in which case
-- double quotes are used. If the value contains both kinds of quote, it is
-- double-quoted and every embedded double quote is written as @&quot;@ so
-- the value cannot terminate the attribute early.
showAttr :: Attribute Char -> String
showAttr (a, v) = a ++ "=" ++ quoted
  where
    sq = '\''
    dq = '"'
    quoted
      | sq `notElem` v = sq : v ++ [sq]
      | dq `notElem` v = dq : v ++ [dq]
      | otherwise      = dq : concatMap escapeDq v ++ [dq]
    escapeDq c
      | c == dq   = "&quot;"
      | otherwise = [c]
-- | Replace the markup-significant characters @<@, @>@ and @&@ with their
-- HTML entities; every other character is copied unchanged.
escape :: String -> String
escape = concatMap esc
  where
    esc '<' = "&lt;"
    esc '>' = "&gt;"
    esc '&' = "&amp;"
    esc c   = [c]
--
-- | Run each sample input through parse -> translate -> show and print the
-- result, one sample per line.
main :: IO ()
main =
  mapM_
    (putStrLn . showTags . translate . parseTags)
    [testdata1, testdata2, testdata3, testdata4]
-- Sample inputs printed by 'main'.
-- testdata1: a script tag whose attribute value itself contains script markup.
testdata1 = "<script foo=\"<script>alert('bar')</script>\">alert('foo')</script>"
-- testdata2: a script tag whose attribute value contains an anchor tag.
testdata2 = "<script foo=\"<a href='link'>link</a>\">alert('foo')</script>"
-- testdata3: an anchor whose href value contains a '>' character.
testdata3 = "<a href='www.g>oogle.com'>link</a>"
-- testdata4: newlines inside a tag and inside text, a comment, and a bare <br>.
testdata4 = "<a\n href='www.google.com'>link<!-- comment --></a> This<br> is an\n example."
{-
*Main> :main
<script foo="<script>alert('bar')</script>">alert('foo')</script>
<script foo="<a href='link'>link</a>">alert('foo')</script>
<a href='www.g>oogle.com'>link<a/>
<a href='www.google.com'>link<!-- comment --><a/> This <br/> is an<br/> example.
-}
|
通すタグに含まれる改行は一度アトリビュート毎に分解するからそこで 消えるかと思いきや、''で括られてたりすると残るのでした。 URLに改行が含まれるというのもアレですが… '<~>'に変換されたタグに含まれる改行をどうするか迷ったのですが 今回は<br/>には変換していません。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 | <?php
/**
 * Sanitise an HTML fragment against a small tag whitelist.
 *
 * Allowed tags are <a> (href/name attributes only), <strong> (no attributes)
 * and <br> (always emitted as <br/>). Any other tag is shown literally by
 * writing its angle brackets as &lt; / &gt;. Text between tags has its angle
 * brackets escaped and its newlines passed through nl2br(). Closing tags that
 * were never opened are dropped, and tags still open at the end are closed.
 *
 * @param string $str Untrusted HTML fragment.
 * @return string Sanitised markup.
 */
function safehtml($str)
{
    // Flag bit 1: paired tag tracked on the open-tag stack.
    // Flag bit 2: always emitted in self-closing form.
    // Optional second element: attribute names that may be kept.
    $safetag=array('a'=>array(1,array('href','name')),'strong'=>array(1),'br'=>array(2));
    $raw=array('<','>');
    $ent=array('&lt;','&gt;');
    $r=array();
    $tags=array();   // currently open tags, most recent first
    $offs=0;
    while(preg_match('!<(\s*(/|)\s*(([^<>\'"/]+|\'[^\']*\'|"[^"]*")*)(/|)\s*)>!',$str,$m1,PREG_OFFSET_CAPTURE,$offs))
    {
        // Text preceding the tag: escape stray brackets, then convert newlines.
        $r[]=nl2br(str_replace($raw,$ent,substr($str,$offs,$m1[0][1]-$offs)));
        $offs=$m1[0][1]+strlen($m1[0][0]);
        // Split the tag body into the name followed by name[=value] pairs.
        preg_match_all('!([^\s\'"=]+)(\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]+)|)!im',$m1[3][0],$m2,PREG_SET_ORDER);
        // A tag such as "<>" has no name at all; treat it as not allowed.
        $tag=isset($m2[0][1]) ? strtolower($m2[0][1]) : '';
        if(isset($safetag[$tag]))
        {
            if($safetag[$tag][0]&1)
            {
                if($m1[2][0])
                {
                    if(array_search($tag,$tags)===false)
                        continue;   // never close a tag that is not open
                    // Close in reverse order of opening, down to this tag.
                    while(($t=array_shift($tags)))
                    {
                        $r[]="</$t>";
                        if($t==$tag)
                            break;
                    }
                    continue;
                }
                if(!$m1[5][0])
                    array_unshift($tags,$tag);
            }
            if($safetag[$tag][0]&2)
            {
                $m1[2][0]="";
                $m1[5][0]="/";
            }
            $t=array($tag);
            if(isset($safetag[$tag][1]) && !$m1[2][0])
            {
                array_shift($m2);   // drop the tag name, leaving only attributes
                while($param=array_shift($m2))
                {
                    if(array_search(strtolower($param[1]),$safetag[$tag][1])!==false)
                        $t[]=$param[0];
                }
            }
            $r[]='<'.$m1[2][0].implode(" ",$t).$m1[5][0].'>';
        }
        else
            $r[]=str_replace($raw,$ent,$m1[0][0]);
    }
    // Text after the last tag gets the same treatment as text between tags.
    $r[]=nl2br(str_replace($raw,$ent,substr($str,$offs)));
    while(($tag=array_shift($tags)))   // close tags the input left open
        $r[]="</$tag>";
    return implode("",$r);
}
echo safehtml(<<<EOT
<a href='www.google.com'
target=_blank>link</a> <blink dummy='<'>and</blink><br> <strong onClick='alert("NG")'>cli
ck<br/>me!</strong> <z foo='<script>alert("Boo")</script>'>
EOT
);
?>
|
#2906を見て<>の対応が崩れている場合にうまく対応できていない事に気がつきました。 それから最後のタグより後ろの部分の改行を変換し忘れていたので合わせて修正しました。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 | <?php
/**
 * Replace angle brackets with their HTML entities so the text is shown
 * literally instead of being parsed as markup.
 *
 * @param string $str Text to neutralise.
 * @return string $str with '<' as '&lt;' and '>' as '&gt;'.
 */
function escapetag($str)
{
    return str_replace(array('<','>'),array('&lt;','&gt;'),$str);
}
/**
 * Sanitise an HTML fragment against a small tag whitelist.
 *
 * Allowed tags are <a> (href/name attributes only), <strong> (no attributes)
 * and <br> (always emitted as <br/>). Any other tag, and any stray angle
 * bracket in the text, is passed through escapetag(). Newlines in text are
 * converted with nl2br(). Closing tags that were never opened are dropped,
 * and tags still open at the end are closed.
 *
 * @param string $str Untrusted HTML fragment.
 * @return string Sanitised markup.
 */
function safehtml($str)
{
    // Flag bit 1: paired tag tracked on the open-tag stack.
    // Flag bit 2: always emitted in self-closing form.
    // Optional second element: attribute names that may be kept.
    $safetag=array('a'=>array(1,array('href','name')),'strong'=>array(1),'br'=>array(2));
    $r=array();
    $tags=array();   // currently open tags, most recent first
    $offs=0;
    while(preg_match('!<(\s*(/|)\s*(([^<>\'"/]+|\'[^\']*\'|"[^"]*")*)(/|)\s*)>!',$str,$m1,PREG_OFFSET_CAPTURE,$offs))
    {
        $r[]=nl2br(escapetag(substr($str,$offs,$m1[0][1]-$offs)));
        $offs=$m1[0][1]+strlen($m1[0][0]);
        // Split the tag body into the name followed by name[=value] pairs.
        preg_match_all('!([^\s\'"=]+)(\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]+)|)!im',$m1[3][0],$m2,PREG_SET_ORDER);
        // A tag such as "<>" has no name at all; treat it as not allowed.
        $tag=isset($m2[0][1]) ? strtolower($m2[0][1]) : '';
        if(isset($safetag[$tag]))
        {
            if($safetag[$tag][0]&1)
            {
                if($m1[2][0])
                {
                    if(array_search($tag,$tags)===false)
                        continue;   // never close a tag that is not open
                    // Close in reverse order of opening, down to this tag.
                    while(($t=array_shift($tags)))
                    {
                        $r[]="</$t>";
                        if($t==$tag)
                            break;
                    }
                    continue;
                }
                if(!$m1[5][0])
                    array_unshift($tags,$tag);
            }
            if($safetag[$tag][0]&2)
            {
                $m1[2][0]="";
                $m1[5][0]="/";
            }
            $t=array($tag);
            if(isset($safetag[$tag][1]) && !$m1[2][0])
            {
                array_shift($m2);   // drop the tag name, leaving only attributes
                while($param=array_shift($m2))
                {
                    if(array_search(strtolower($param[1]),$safetag[$tag][1])!==false)
                        $t[]=$param[0];
                }
            }
            $r[]='<'.$m1[2][0].implode(" ",$t).$m1[5][0].'>';
        }
        else
            $r[]=escapetag($m1[0][0]);
    }
    $r[]=nl2br(escapetag(substr($str,$offs)));
    while(($tag=array_shift($tags)))   // close tags the input left open
        $r[]="</$tag>";
    return implode("",$r);
}
echo safehtml(<<<EOT
<a href='www.google.com'
target=_blank>link</a>> <blink dummy='<'>and</blink><<br> <strong onClick='alert("NG")'>cli
ck<br/>me!</strong> <z foo='<script>alert("Boo")</script>'><
EOT
);
?>
|
前回のreplaceをwhileループに変更し、マッチしなかった箇所に対しても置換処理をする。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | var input = "<a title=\"(>_<;)\" href='www.google.com' name='hoge'\n\
>link</a> <blink>and</blink>\n <strong onClick='alert(\"NG\")'>click<br>me!</strong>";
/**
 * Strip every quoted attribute except `name` and `href` from the attribute
 * portion of a tag.
 *
 * The attribute name must match exactly (case-insensitively); a name that
 * merely contains "name" or "href", such as `nickname`, is removed.
 *
 * @param {string} attr Attribute text as captured from the tag (may be empty
 *     or undefined when the tag has no attributes).
 * @returns {string} The attribute text with disallowed attributes removed.
 */
function deleteAttr(attr) {
  if (!attr) {
    return '';
  }
  return attr.replace(/\s+(\w+)\s*=\s*(["'])((?:[\r\n]|.)*?)\2/g,
    function (all, name, q, value) {
      return /^(?:name|href)$/i.test(name) ? all : '';
    });
}
/**
 * Make a text fragment safe to insert as HTML: `&` and `<` become entities
 * and each newline (LF or CRLF) becomes `<br/>`.
 *
 * `&` is replaced first so the ampersands of the entities produced afterwards
 * are not escaped a second time.
 *
 * @param {string} h Raw text.
 * @returns {string} Escaped text.
 */
function escapeHtml(h) {
  return h.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/\r?\n/g, '<br/>');
}
/**
 * Rewrite one matched tag. Called with the match array of the tag pattern in
 * `filter` spread as arguments.
 *
 * - BR: always emitted as `<br/>`.
 * - STRONG: attributes are dropped; the slashes are kept as written.
 * - A: attributes are reduced by `deleteAttr`.
 * - anything else: the whole tag is escaped with `escapeHtml`.
 *
 * @param {string} all The complete matched tag.
 * @param {string} fslash "/" for a closing tag, otherwise "".
 * @param {string} tag Tag name.
 * @param {string|undefined} attrs Attribute text; undefined when the optional
 *     attribute group did not participate in the match.
 * @param {string|undefined} q Quote character captured by the pattern (unused).
 * @param {string} rslash "/" for a self-closing tag, otherwise "".
 * @returns {string} Replacement markup.
 */
function escapeTag(all, fslash, tag, attrs, q, rslash) {
  // An attribute-less tag leaves the capture undefined; normalise it so it is
  // never concatenated as the string "undefined" or passed on as a non-string.
  attrs = attrs || '';
  switch (tag.toUpperCase()) {
    case 'BR':
      return '<br/>';
    case 'STRONG':
      attrs = '';
      break;
    case 'A':
      attrs = deleteAttr(attrs);
      break;
    default:
      return escapeHtml(all);
  }
  return '<' + fslash + tag + attrs + rslash + '>';
}
/**
 * Filter an HTML string: every tag found by the tag pattern is rewritten by
 * `escapeTag`, and the text before, between and after the tags is passed
 * through `escapeHtml`.
 *
 * @param {string} html Input markup.
 * @returns {string} Filtered markup.
 */
function filter(html) {
  var tagPattern = /<(\/?)(\w+)((?:\s+\w+\s*=\s*(["'])(?:[\r\n]|.)*?\4)*)?(\/?)>/gmi;
  var pieces = [];
  var cursor = 0;
  var found;
  while ((found = tagPattern.exec(html))) {
    // Text between the previous tag (or the start) and this one.
    pieces.push(escapeHtml(html.substring(cursor, found.index)));
    pieces.push(escapeTag.apply(null, found));
    cursor = found.index + found[0].length;
  }
  // Whatever follows the last tag.
  pieces.push(escapeHtml(html.substring(cursor, html.length)));
  return pieces.join('');
}
document.body.innerHTML=filter(input);
|
前のお題の#2759で実装したクラスを継承します。 #2759には下の方にテスト用の式をトップレベルに 書いてあるので適当に処理してください。 置き換えたタグ内の改行も、引用符にくくられていようと 無視して<br/>に置換しています。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 | from HTMLParser2 import HTMLParser2
class HTMLParser3(HTMLParser2):
    """HTMLParser2 variant that also turns newlines and <br> into <br/>."""

    def replace(self, s):
        # '&' is handled first so the ampersands of the entities produced by
        # the later replacements are not escaped again.
        return (s.replace('&', '&amp;')
                 .replace('<', '&lt;')
                 .replace('>', '&gt;')
                 .replace('\n', '<br/>'))

    def handle_starttag(self, tag, attrs):
        # <br> is always written in self-closing form; every other tag is
        # left to the parent class.
        if tag == 'br':
            self.buf += '<br/>'
        else:
            HTMLParser2.handle_starttag(self, tag, attrs)

    def handle_data(self, data):
        # Text content: each newline becomes a line break.
        self.buf += data.replace('\n', '<br/>')
def f(s):
    """Feed s to a fresh HTMLParser3, then print s followed by the parser's buffer."""
    parser = HTMLParser3()
    parser.feed(s)
    print(s)
    print(parser.buf)
if __name__ == '__main__':
f('''<script foo="<script>alert('bar')</script>">alert('foo')</script>''')
f('''<script foo="<a href='link'>link</a>">alert('foo')</script>''')
f('''<a href='www.g>oogle.com'>link</a>''')
f('''<br>abc\ndef\n<br>''')
|
同じく前回の投稿 #2763 を再利用して。 折角、フィルタ登録型にしたのに、 フィルタ探索関数をクロージャにしてしまったのが裏目にでた。 MyHTMLParserを継承してますが、変更部分は filterメソッド内の find_filter のみです。 今回のお題の追加分は、 fix_br_tag, nl_to_br_in_text 。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 | # wget http://ja.doukaku.org/comment/2763/download/ -O MyHTMLParser.py
from MyHTMLParser import MyHTMLParser, TEXT,TAG
class MyHTMLParser2(MyHTMLParser):
def filter(self, (state,tag,attrs,text)):
# find_filter = lambda x:x.get((state,tag.lower()), lambda x:x)
def find_filter(filters):
tag_ = tag.lower()
if filters.has_key((state,tag_)):
return filters.get((state,tag_))
return filters.get((state,None), lambda x:x)
tag,attrs = find_filter(self.tag_filters)((tag,attrs))
attrs = find_filter(self.attr_filters)(attrs)
text = find_filter(self.text_filters)(text)
return state,tag,attrs,text
def test(html):
import re
def allow_attrs(*names):
return lambda attrs: [(k,quote(v)) for k,v in attrs if k.lower() in names]
def remove_all_attrs(attrs):
return []
def fix_br_tag((tag,attrs)):
return ('br/',attrs)
def nl_to_br_in_text(text):
return re.sub(text, r"(\r\n|\r|\n)","<br />")
p = MyHTMLParser2()
p.allow_tags += ['a', 'br', 'strong']
p.attr_filters[(TAG,'a')] = allow_attrs('href', 'name')
p.attr_filters[(TAG,'br')] = remove_all_attrs
p.attr_filters[(TAG,'strong')] = remove_all_attrs
p.tag_filters[(TAG,'br')] = fix_br_tag
p.text_filters[(TEXT,None)] = nl_to_br_in_text
p.parse(html)
|
Copy&Pasteしたらライブラリのimport漏れ。 test関数内import re の後に、from urllib import quote 追加して下さい。
ややこしいので最初に一括エスケープするように方針変更。タグ中の改行はスペースに置き換えて対処。
javascript:with(document.body)(function(s,i,g,n){i=/<(\/?(?:(a)|br|strong)\b\s?)((?:".*?"|'.*?'|[^>])*?)(?=\/?>)/gi,g=/\b(?:name|href)\s*=\s*(?:".*?"|'.*?'|[^\s>]*)/gi,n=/\r\n|[\r\n]|<br\s*>/gi;innerHTML=s.replace(/&/g,'&').replace(/</g,'<').replace(i,function(_,t,A,a){return('<'+t+(A?(a.match(g)||[]).join(' '):'')).replace(n,' ')}).replace(n,'<br/>')})(innerHTML)
javascript:with(document.body)(function(s,i,g,n){i=/<(\/?(?:(a)|br|strong)\b\s?)((?:".*?"|'.*?'|[^>])*?)(?=\/?>)/gi,g=/\b(?:name|href)\s*=\s*(?:".*?"|'.*?'|[^\s>]*)/gi,n=/\r\n|[\r\n]|<br\s*>/gi;innerHTML=s.replace(/&/g,'&').replace(/</g,'<').replace(i,function(_,t,A,a){return('<'+t+(A?(a.match(g)||[]).join(' '):'')).replace(n,' ')}).replace(n,'<br/>')})(innerHTML)
1 2 3 4 5 6 7 8 | function doukaku57(s){
var xOK = /<(\/?(?:(a)|br|strong)\b\s?)((?:".*?"|'.*?'|[^>])*?)(?=\/?>)/gi,
xNH = /\b(?:name|href)\s*=\s*(?:".*?"|'.*?'|[^\s>]*)/gi,
xBR = /\r\n|[\r\n]|<br\s*>/gi;
return s.replace(/&/g,'&').replace(/</g,'<').replace(xOK, function(_, tag, A, ats){
return ('<'+ tag + (A ? (ats.match(xNH) || []).join(' ') : '')).replace(xBR, ' ');
}).replace(xBR, '<br/>');
}
|
2706をベースにしてます。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 | import java.util.regex._
/** Adds a callback-driven `gsub` to String. */
class ExtendedString(self: String) {
  /**
   * Replace every match of `reg` in the wrapped string with the text that
   * `f` returns for that match.
   *
   * The callback result is inserted literally: it is passed through
   * `Matcher.quoteReplacement` so that `$` and `\` in it are not interpreted
   * as group references or escapes by `appendReplacement`.
   */
  def gsub(reg: Pattern, f: (Matcher) => String): String = {
    val result = new StringBuffer
    val m = reg.matcher(self)
    while (m.find) m.appendReplacement(result, Matcher.quoteReplacement(f(m)))
    m.appendTail(result)
    result.toString
  }

  /** Same as above, compiling `reg` as a regular expression first. */
  def gsub(reg: String, f: (Matcher) => String): String = gsub(Pattern.compile(reg), f)
}
implicit def string2ext(self:String) = new ExtendedString(self);
/**
 * HTML filter: keeps a, br and strong tags (a keeps only href/name), shows
 * every other tag literally by escaping it, and turns newlines in text into
 * `<br/>`.
 */
object htmlEscape {
  // Groups: 1 = "<" plus optional "/", 2 = tag name plus trailing space,
  // 3 = attribute text, 4 = closing ">" (or empty at a following "<" / end).
  lazy val tagRegex = Pattern.compile(
    """(<\s*/?\s*)([^"'<\s>]*\s*)([^"'<>]*(?:"[^"]*"[^"'<>]*|'[^']*'[^"'<>]*)*)((?:>|(?=<)|$(?!\n)))"""
  )
  lazy val attrRegex = Pattern.compile(
    """[\s'"](\w+)\s*=\s*([^\s'">]+|'[^']+'|\"[^"]+")"""
    , Pattern.DOTALL | Pattern.CASE_INSENSITIVE)
  lazy val tagAllowed = Set("a", "br", "strong")
  lazy val attrAllowed = Map("a" -> Set("href", "name"))

  /** Apply the (from, to) literal replacements to `s`, in order. */
  def replace(s: String, rs: (String, String)*) = {
    rs.foldLeft(s) { (r, v) => r.replace(v._1, v._2) }
  }

  /** Filter `html` and return the sanitised markup. */
  def apply(html: String) = {
    val other = tagRegex.split(html) // text segments surrounding the tags
    val sb = new StringBuilder
    val m = tagRegex.matcher(html)
    var i = 0
    // Emit the next text segment, if there is one. split() drops trailing
    // empty segments, so there can be fewer segments than tags + 1.
    def appendNextText(): Unit = {
      if (i < other.length)
        sb.append(replace(other(i), ("<", "&lt;"), (">", "&gt;"), ("\n", "<br/>")))
      i += 1
    }
    appendNextText()
    while (m.find) {
      val tag = replace(m.group(2).toLowerCase, ("/", "")).trim
      sb.append(
        (if (tagAllowed.contains(tag)) {
          // Drop every attribute that is not whitelisted for this tag.
          val attrs = m.group(3).gsub(attrRegex, (m2: Matcher) => {
            if (attrAllowed.getOrElse(tag, Set[String]()).contains(m2.group(1).toLowerCase)) {
              m2.group(0)
            } else {
              ""
            }
          })
          if (tag == "br") List("<br/>")
          else List(m.group(1), m.group(2), attrs, m.group(4))
        } else {
          // Not allowed: show the tag literally.
          List(replace(m.group(1), ("<", "&lt;")), m.group(2),
               replace(m.group(3), ("<", "&lt;"), (">", "&gt;")), m.group(4))
        }).mkString("")
      )
      appendNextText()
    }
    sb.toString
  }
}
|
ああやってしまった・・・Markdownにしてるの忘れて行頭に#2706を書いてしまいました。修正お願いできますか?
修正しました。
ありがとうございました。
以後気をつけます。
Squeak Smalltalk で。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | | string accepts in out upToAnyOf letters separators replaceCRs |
string := '<a title="(>_<;)" href=''www.google.com''
name=''hoge'' target=_blank>link</a> <blink>a
nd</blink> <strong onClick=''alert("NG")''>click<br>me!</strong>'.
accepts := {#a->#(name href). #strong->#(). #br->#()} as: Dictionary.
string := string copyReplaceAll: '<br>' with: '<br/>'.
in := string readStream.
out := String new writeStream.
upToAnyOf := [:arr | String streamContents: [:ss |
arr := arr copyWith: nil.
[arr includes: in peek] whileFalse: [ss nextPut: in next]]].
letters := Character alphabet asArray, Character alphabet asUppercase.
separators := Character separators, #($/ $>).
replaceCRs := [:str | str copyReplaceAll: String cr with: '<br/>'].
[out nextPutAll: (replaceCRs value: (in upTo: $<) escapeEntities). in atEnd] whileFalse: [
| tag lt isClose isAccepted blank rest |
(isClose := in peek == $/) ifTrue: [in next].
tag := upToAnyOf value: separators.
lt := '<', (isClose ifTrue: ['/'] ifFalse: ['']).
(isAccepted := accepts keys includes: tag asLowercase) ifFalse: [lt := lt escapeEntities].
out nextPutAll: lt, tag.
[blank := upToAnyOf value: letters, '>'. {nil. $>} includes: in peek] whileFalse: [
| attr equal value quote |
attr := upToAnyOf value: #($= $>).
equal := in peek == $= ifTrue: [in next asString] ifFalse: [''].
value := (#($' $") includes: (quote := in peek))
ifTrue: [quote asString, (in next; upTo: quote), quote asString]
ifFalse: [upToAnyOf value: #($ $>)].
out nextPutAll: (isAccepted
ifFalse: [blank, attr, equal, value escapeEntities]
ifTrue: [((accepts at: tag) includes: attr)
ifTrue: [blank, attr, equal, value] ifFalse: ['']])].
rest := blank, (in peek == $> ifTrue: [in next asString] ifFalse: ['']).
out nextPutAll: (isAccepted ifTrue: [rest] ifFalse: [rest escapeEntities])].
World findATranscript: nil.
Transcript cr; show: out contents
"=> <a href='www.google.com'
name='hoge'>link</a> <blink>a<br/>nd</blink> <strong>click<br/>me!</strong> "
|
今回の改行の処理の対応と、 前回のものが、属性のなかのタグの途中で改行が入った "<foo clear='<scr ipt>foo>'>foo</foo>" のようなものを上手く処理できていないことに気付いたので直し… …たつもりですが、私には問題が難し過ぎたようで、 何が何だか分からないものになり果てました…。 タグのエスケープ処理のため2種の文字を予約で消費してしまいます。 下記では、ここへの投稿のために"【"と"】"を使いました。 また、前回はライブラリが見付けられずAllegroのparse-htmlを 使用して作成しましたが、Clikiにポータブル版のpxmlutilsが ありました。 http://www.cliki.net/pxmlutils 今回はそれを導入してsbclで動作確認しています。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | (use-package :net.html.parser)
(use-package :cl-ppcre)
(use-package :url-rewrite)
(defun html-filter-2 (str)
  "Filter STR and return the resulting HTML string.
STR is run through prop-filter-1, <>-to-escape, pickup-tag, prop-filter-2
and html-filter-1. In the result, the placeholder brackets that are still
present are written as the entities &gt; / &lt;, and newlines become <br />."
  (regex-replace-all-lis
   '(("】" . "&gt;") ("【" . "&lt;") ("\\n" . "<br />"))
   (html-filter-1
    (prop-filter-2 (pickup-tag (<>-to-escape (prop-filter-1 str)))))))
(defun regex-replace-all-lis (repl-alist str
&optional &key case-insensitive-mode)
(reduce
(lambda (res item)
(let ((scan (create-scanner
(car item)
:case-insensitive-mode case-insensitive-mode)))
(regex-replace-all scan res (cdr item))))
repl-alist :initial-value str))
(defun <>-to-escape (str)
(regex-replace-all-lis '((">" . "】")("<" . "【")) str))
(defun <>-to-ltgt (str)
  "Return STR with every > replaced by &gt; and every < replaced by &lt;."
  (regex-replace-all-lis '((">" . "&gt;") ("<" . "&lt;")) str))
(defun pickup-tag (str)
(regex-replace-all-lis
'(("【(a\\s*[^\\s】]*)】" . "<\\1>")
("【(/a\\s*)】" . "</a>")
("【(strong\\s*[^\\s】]*)】" . "<\\1>")
("【(/strong\\s*)】" . "</strong>")
("【(br\\s*[^\\s】]*)】" . "<br />"))
str))
(defun prop-filter-1 (str)
(prop-filter-* str "('[^']*')" #'url-encode))
(defun prop-filter-2 (str)
(prop-filter-* str "(【[^】]*】)" #'<>-to-ltgt))
(defun prop-filter-* (str scan-pat repl-func)
(let ((scan (create-scanner scan-pat :case-insensitive-mode t)))
(regex-replace-all scan str
(lambda (match &rest rest)
(declare (ignore rest))
(if (scan "[<>]" match)
(funcall repl-func match)
match))
:simple-calls t)))
(defun html-filter-1 (string)
(let ((form (sanitize-html (parse-html string))))
(apply #'concatenate 'string (build-html form))))
(defun sanitize-html (form)
(mapcar
(lambda (item)
(if (atom item)
item
(let ((keys `(,(car item) ,(and (consp (car item)) (caar item)))))
(cond ((member :a keys)
`(,(car item) ,@(sanitize-html (cdr item))))
((member :strong keys)
`(:strong ,@(sanitize-html (cdr item))))
((member :br keys) :br)
('T "")))))
form))
(defun build-html (form)
(if (atom form)
form
(cond ((and (atom (car form)) (eq :br (car form)))
`(,(br) ,@(build-html (cdr form))))
((consp (car form))
`(,(let* ((top (car form))
(keys `(,(car top) ,(and (consp (car top)) (caar top)))))
(cond ((member :a keys) (build-a top))
((member :strong keys) (build-strong top))
('T top)))
,@(build-html (cdr form))))
('T `(,(build-html (car form))
,@(build-html (cdr form)))))))
(defun br () "<br />")
(defun build-strong (form)
(format nil "<strong>~{~A~}</strong>" (build-html (cdr form))))
(defun build-a (form)
(let ((tag (car form))
(body (build-html (cdr form))))
(if (and (consp (car form))
(member (second tag) '(:href :name)))
(format nil "<a ~(~A~)='~A'>~{~A~}</a>"
(second tag)
(put-dotslash (third tag)) body)
(format nil "<a>~{~A~}</a>" (build-html (cdr form))))))
(defun put-dotslash (str)
  "Return STR unchanged when it starts with http://, https://, / or ./
(scheme compared case-insensitively); otherwise return STR prefixed with ./"
  ;; The dot is escaped with a doubled backslash: inside a Lisp string a
  ;; single backslash is consumed by the reader, which would leave an
  ;; unescaped `.` that matches any character.
  (let ((s (create-scanner "^(?:[Hh][Tt][Tt][Pp][Ss]?://|/|\\./)")))
    (if (scan s str)
        str
        (concatenate 'string "./" str))))
(defun prop-maker (lst)
(do ((l lst (cddr l))
result)
((endp l) (nreverse result))
(push (format nil "~A=\"~A\""
(car l) (<>-to-ltgt (cadr l))) result)))
|
#2911 を改変。変換と出力を分離しました。 74 行目までは同じなのでそれ以降だけ。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | let filter_attributes tagname attrlist =
List.filter
(fun (attrname, _) -> is_allowed_attribute tagname attrname)
attrlist
let filter_input src = function
| String _ as x -> x
| SingleTag (tagname, attrs, p, q) ->
if is_allowed_tag tagname then
SingleTag(tagname, filter_attributes tagname attrs, p, q)
else
String (String.sub src p (q-p))
| OpenTag (tagname, attrs, p, q) ->
if tagname === "br" then
SingleTag(tagname, [], p, q)
else if is_allowed_tag tagname then
OpenTag(tagname, filter_attributes tagname attrs, p, q)
else
String (String.sub src p (q-p))
| CloseTag (tagname, p, q) as x ->
if is_allowed_tag tagname then x else
String (String.sub src p (q-p))
| Eof -> Eof
(* Append the characters string.[p] .. string.[q-1] to [buf], writing
   '<', '>' and '&' as HTML entities and each newline as "<br/>".
   Nothing is appended when q <= p. *)
let sanitizing_output buf string p q =
  for x = p to q - 1 do
    match string.[x] with
    | '<'  -> Buffer.add_string buf "&lt;"
    | '>'  -> Buffer.add_string buf "&gt;"
    | '&'  -> Buffer.add_string buf "&amp;"
    | '\n' -> Buffer.add_string buf "<br/>"
    | c    -> Buffer.add_char buf c
  done
(* Append " name=value" to [buf] with a quoted value.
   Double quotes are used unless the value contains one, in which case single
   quotes are used. If the value contains both kinds of quote it is
   double-quoted and every embedded double quote is written as &quot;, so the
   value cannot terminate the attribute early. *)
let output_attribute buf (attrname, value) =
  let has c = String.contains value c in
  if has '"' && has '\'' then begin
    Printf.bprintf buf " %s=\"" attrname;
    String.iter
      (function
        | '"' -> Buffer.add_string buf "&quot;"
        | c   -> Buffer.add_char buf c)
      value;
    Buffer.add_char buf '"'
  end else begin
    let quote = if has '"' then '\'' else '"' in
    Printf.bprintf buf " %s=%c%s%c" attrname quote value quote
  end
let output_tag buf name attrs is_single =
Printf.bprintf buf "<%s" name;
List.iter (output_attribute buf) attrs;
if is_single then Buffer.add_char buf '/';
Buffer.add_char buf '>'
let output_text_fragment buf = function
| String s ->
sanitizing_output buf s 0 (String.length s)
| SingleTag (s, attrs, _, _) ->
output_tag buf s attrs true
| OpenTag (s, attrs, _, _) ->
output_tag buf s attrs false
| CloseTag (s, _, _) ->
Printf.bprintf buf "</%s>" s
| Eof -> ()
let filter_text text =
let list = parse_input text in
let flist = List.map (filter_input text) list in
let buf = Buffer.create (String.length text) in
List.iter (output_text_fragment buf) flist;
Buffer.contents buf
}
|
#2906を元に改造しました。 </br>は取り除くように変更しています(<br></br>の変換結果が不正になるのを防ぐため)。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | import java.util.regex.*;
public class Sample {
private static final Pattern TAG_FILTER = Pattern.compile
("<(¥¥w+)((¥¥s+¥¥w+(¥¥s*=¥¥s*(¥"[^¥"]*¥"|'[^']*'|[¥¥w-:]*))?)*)¥¥s*/?¥¥s*>");
private static final Pattern END_TAG_FILTER = Pattern.compile
("(?i)</(A|STRONG)¥¥s*>");
private static final Pattern ATTR_FILTER = Pattern.compile
("(¥¥w+)¥¥s*=¥¥s*(¥"[^¥"]*¥"|'[^']*'|[¥¥w-:]*)");
public static String sanitizing(String fragment) {
fragment = fragment.replaceAll("[¥¥p{Cntrl}&&[^¥¥s]]", "");
Matcher m = TAG_FILTER.matcher(fragment);
StringBuffer sb = new StringBuffer();
while (m.find()) {
if ("A".equalsIgnoreCase(m.group(1))) {
String href = null, name = null;
Matcher m2 = ATTR_FILTER.matcher(m.group(2));
while (m2.find()) {
if ("href".equalsIgnoreCase(m2.group(1))) {
href = m2.group(2);
} else if ("name".equalsIgnoreCase(m2.group(1))) {
name = m2.group(2);
}
}
String tag = "¥001"+m.group(1) + ((href ! |




にしお
#3413()
Rating-2/2=-1.00
また、ユーザの入力中の<br>は<br/>に変換してください。
このお題はperezvonさんの提案を元にした三部作の二問目です。ご協力ありがとうございました。
[ reply ]