文字列を指定されたバイト数で分割
Posted feedbacks - Flatten
Nested Hidden1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | (defn splitBytes [string len encode]
(let [count-bytes (fn [cs] (-> (apply str cs) (.getBytes encode) count))
len1 (fn [cs]
(let [n (count cs)]
(loop [i 0]
(if (and (< i n)
(<= (count-bytes (take (inc i) cs)) len))
(recur (inc i))
i))))
step (fn step [cs]
(when cs
(let [[h t] (split-at (len1 cs) cs)]
(when h
(lazy-cons h (step t))))))]
(step string)))
(doseq [encode ["Shift_JIS" "UTF-8"]]
(-> (splitBytes "あいうえおabcdeかきくけこfghij" 10 encode) prn))
;; ((\あ \い \う \え \お) (\a \b \c \d \e \か \き) (\く \け \こ \f \g \h \i) (\j))
;; ((\あ \い \う) (\え \お \a \b \c \d) (\e \か \き \く) (\け \こ \f \g \h \i) (\j))
;; nil
|
UTF-8で391ミリ秒
Shift-Jisで641ミリ秒
(共に出力にかかる時間は抜き)
でした。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 | using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace どう書く_org_文字列を指定されたバイト数で分割 {
    /// <summary>
    /// Benchmark driver: reads a Shift-JIS CSV file, splits its text into chunks of at most
    /// 2000 encoded bytes (once counting UTF-8 bytes, once Shift-JIS bytes) and prints the
    /// chunks followed by the elapsed time of each split.
    /// </summary>
    class Program {
        static void Main(string[] args) {
            string data;
            using(StreamReader sr = new StreamReader(@"C:\Documents and Settings\Owner\デスクトップ\37kagawa\37KAGAWA.CSV",Encoding.GetEncoding("Shift-Jis"))) {
                data = sr.ReadToEnd();
            }

            // Only the split itself is timed; printing happens after the time string is built.
            long ticks = Environment.TickCount;
            IList<string> r = splitBytes(data, 2000, Encoding.UTF8);
            string time = (Environment.TickCount - ticks).ToString() + "ミリ秒";
            foreach(string str in r) {
                Console.WriteLine(str);
            }
            Console.WriteLine("UTF-8:"+time);//391ミリ秒

            ticks = Environment.TickCount;
            r = splitBytes(data, 2000, Encoding.GetEncoding("Shift-Jis"));
            time = (Environment.TickCount - ticks).ToString() + "ミリ秒";
            foreach(string str in r) {
                Console.WriteLine(str);
            }
            Console.WriteLine("Shift-Jis:"+time);//641ミリ秒
            Console.ReadLine();
        }

        /// <summary>
        /// Splits <paramref name="value"/> into consecutive pieces whose encoded size does not
        /// exceed <paramref name="byteLengh"/> bytes in <paramref name="encoding"/>.
        /// </summary>
        /// <remarks>
        /// Byte sizes are accumulated per character instead of re-encoding the whole pending
        /// chunk on every step, so this assumes an encoding without shift state (true for
        /// UTF-8 and Shift-JIS). A surrogate pair is measured and kept together as one unit.
        /// An empty input yields a list holding one empty string.
        /// </remarks>
        /// <exception cref="ArgumentException">
        /// A single character needs more than <paramref name="byteLengh"/> bytes, so no valid
        /// split exists (the previous implementation never returned in this case).
        /// </exception>
        static IList<string> splitBytes(string value, int byteLengh, Encoding encoding) {
            List<string> r = new List<string>();
            StringBuilder tmp = new StringBuilder();
            char[] chars = value.ToCharArray();
            int tmpBytes = 0;
            int i = 0;
            while(i < chars.Length) {
                // Treat a well-formed surrogate pair as a single indivisible unit.
                int width = 1;
                if(char.IsHighSurrogate(chars[i]) && i + 1 < chars.Length && char.IsLowSurrogate(chars[i + 1])) {
                    width = 2;
                }
                int bytes = encoding.GetByteCount(chars, i, width);
                if(bytes > byteLengh) {
                    throw new ArgumentException("A single character does not fit into the byte limit.", "byteLengh");
                }
                if(tmpBytes + bytes > byteLengh) {
                    r.Add(tmp.ToString());
                    tmp.Length = 0;
                    tmpBytes = 0;
                }
                tmp.Append(chars, i, width);
                tmpBytes += bytes;
                i += width;
            }
            r.Add(tmp.ToString());
            return r;
        }
    }
}
|
Squeak Smalltalk で。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | | splitBytes |
"splitBytes: a block taking (str, limit, enc). It cuts str into consecutive pieces and answers them as an Array, using the byteSize of each prefix converted to encoding enc as the size measure."
splitBytes := [:str :limit :enc |
| stream |
stream := Array new writeStream.
"Consume str from the front until nothing is left."
[str notEmpty] whileTrue: [
| max |
"Find the first prefix length whose converted byteSize exceeds limit and step back by one; if no prefix exceeds it, max becomes str size (the whole remainder)."
"NOTE(review): when the very first character alone exceeds limit, max is 0 and str is not shortened — this loop then looks non-terminating; confirm."
max := ((1 to: str size)
detect: [:each | (((str first: each) convertToEncoding: enc) byteSize > limit)]
ifNone: [str size + 1]) - 1.
stream nextPut: (str first: max).
str := str allButFirst: max].
stream contents].
splitBytes valueWithArguments: #('あいうえおabcdeかきくけこfghij' 10 'sjis').
"=> #('あいうえお' 'abcdeかき' 'くけこfghi' 'j') "
splitBytes valueWithArguments: #('あいうえおabcdeかきくけこfghij' 10 'utf8').
"=> #('あいう' 'えおabcd' 'eかきく' 'けこfghi' 'j') "
|
文字サイズの取得はmbstring.hを使用。
時間の測定にはclock関数を使用(つ 参考ページ)
Cなので比較的速いとは思います。
香川県(69KB)で15ms以下。北海道(842KB)で100ms付近。(マシンはP4 3GHz)
see: C言語: 実行時間測定の方法
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | #include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mbstring.h>
#include <time.h>
/* splitBytes : copy as many whole multibyte characters as fit into a byte budget
 * Arguments  data       : source bytes (multibyte text)
 *            data_len   : number of valid bytes in data
 *            divide_len : maximum number of bytes to copy
 *            out        : destination buffer; must hold divide_len + 1 bytes
 * Returns    : number of bytes copied (out is NUL-terminated after them).
 *              The result is 0 when the next character does not fit into
 *              divide_len at all, so callers must treat 0 as "no progress".
 */
int splitBytes(char *data, int data_len, int divide_len, char out[])
{
    char *p = data;
    char *end = data + data_len;
    char *q = out;
    while (p < end) {
        int clen = (int)_mbclen((const unsigned char *)p);
        /* A lead byte at the very end of the data has no trail byte:
         * never read or copy beyond the data_len bytes we were given. */
        if (clen > end - p)
            clen = (int)(end - p);
        if (q - out + clen > divide_len)
            break;
        memcpy(q, p, (size_t)clen);
        p += clen;
        q += clen;
    }
    *q = '\0';
    return (int)(q - out);
}
/* readFile : read up to buf_len bytes from the start of a file
 * Arguments  filename : path of the file to open (text mode, as before)
 *            buf      : destination buffer
 *            buf_len  : capacity of buf in bytes
 * Returns    : number of bytes stored in buf; 0 when the file cannot be
 *              opened or when no usable buffer was supplied.
 */
int readFile(char *filename, char *buf, int buf_len)
{
    FILE *fp;
    size_t read_len;

    if (buf == NULL || buf_len <= 0)
        return 0;

    fp = fopen(filename, "r");
    if (fp == NULL)     /* previously fread/fclose were called on NULL */
        return 0;

    read_len = fread(buf, 1, (size_t)buf_len, fp);
    fclose(fp);
    return (int)read_len;
}
#define DIVIDE_LEN 2000
/* Benchmark driver: load the CSV, cut it repeatedly into pieces of at most
 * DIVIDE_LEN bytes and report the time spent splitting. */
int main()
{
    /* static storage: an ~850 KB automatic array can exceed the default
     * stack size on some platforms. */
    static char test[850 * 1024];
    char buf[DIVIDE_LEN+1];
    int data_len;
    char *s;
    clock_t start_tm, end_tm;
    // data_len = readFile("01HOKKAI.CSV", test, sizeof(test));
    data_len = readFile("37KAGAWA.CSV", test, sizeof(test));
    start_tm = clock();
    for (s = test; 0 < data_len; ) {
        int cut_len = splitBytes(s, data_len, DIVIDE_LEN, buf);
        // printf("buf=[%s] cut_len=[%d]\n", buf, cut_len);
        if (cut_len <= 0)
            break;      /* no progress possible; avoid spinning forever */
        s += cut_len;
        data_len -= cut_len;
    }
    end_tm = clock();
    printf("実行時間 %f ms\n", (double)(end_tm - start_tm)/CLOCKS_PER_SEC*1000);
    return 0;
}
|
Ruby1.9ならではの感じで。
see: Ruby M17N の設計と実装
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 | # -*- coding: utf-8 -*-
# Transcodes +str+ to +encoding+ and groups its characters into strings of
# at most +limit+ bytes each. Raises when a single character is wider than
# +limit+. An empty input yields [""].
def splitBytes(str, limit, encoding)
  chunks = [""]
  str.encode(encoding).each_char do |char|
    raise if char.bytesize > limit
    # Open a fresh chunk when appending would overflow the current one.
    chunks << "" if (chunks.last + char).bytesize > limit
    chunks[-1] = chunks.last + char
  end
  chunks
end
p splitBytes("あいうえおabcdeかきくけこfghij", 10, "Shift_JIS")
p splitBytes("あいうえおabcdeかきくけこfghij", 10, "UTF-8")
# ruby splitbytes.rb
# ["\202\240\202\242\202\244\202\246\202\250", "abcde\202\251\202\253", "\202\255\202\257\202\261fghi", "j"]
# ["あいう", "えおabcd", "eかきく", "けこfghi", "j"]
|
Core2 Duo 2.0GHzで実行したところ、
UTF-8 で約70ms、Shift_JIS で約140msとなりました。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
public class Sample234 {
    /**
     * Splits {@code input} into consecutive pieces whose size, encoded with
     * {@code charset}, does not exceed {@code splitSize} bytes.
     *
     * <p>A surrogate pair is measured and kept together as one unit. A single
     * character that is wider than {@code splitSize} is emitted as its own
     * piece. Empty pieces are never produced; an empty input gives an empty list.
     *
     * @param input     text to split
     * @param splitSize maximum encoded size of one piece, in bytes
     * @param charset   charset used to measure the encoded size
     * @return the pieces, in input order
     */
    public static List<String> splitBytes(CharSequence input, int splitSize, Charset charset) {
        List<String> result = new ArrayList<String>();
        StringBuilder builder = new StringBuilder(Math.max(splitSize, 0));
        int workLength = 0;
        final int maxIndex = input.length();
        int index = 0;
        while (index < maxIndex) {
            int end = index + 1;
            // Encoding surrogate halves separately would count '?' replacement bytes.
            if (Character.isHighSurrogate(input.charAt(index))
                    && end < maxIndex
                    && Character.isLowSurrogate(input.charAt(end))) {
                end++;
            }
            CharSequence unit = input.subSequence(index, end);
            int length = unit.toString().getBytes(charset).length;
            // Only flush a non-empty chunk, otherwise an empty piece would be emitted.
            if (builder.length() > 0 && workLength + length > splitSize) {
                result.add(builder.toString());
                builder.setLength(0);
                workLength = 0;
            }
            builder.append(unit);
            workLength += length;
            index = end;
        }
        if (builder.length() > 0) {
            result.add(builder.toString());
        }
        return result;
    }

    /** Benchmark driver: reads the Shift_JIS CSV and times a 2000-byte split for two charsets. */
    public static void main(String[] args) throws IOException {
        //System.out.println(splitBytes("あいうえおabcdeかきくけこfghij", 10, Charset.forName("Shift_JIS")));
        //System.out.println(splitBytes("あいうえおabcdeかきくけこfghij", 10, Charset.forName("UTF-8")));
        StringBuilder builder = new StringBuilder();
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(".\\37KAGAWA.CSV"), "Shift_JIS"));
        try {
            char[] buffer = new char[4096];
            int count;
            // Append only the characters actually read; the tail of the buffer is stale.
            while ((count = reader.read(buffer)) >= 0) {
                builder.append(buffer, 0, count);
            }
        } finally {
            reader.close();
        }
        long start = System.currentTimeMillis();
        List<String> list = splitBytes(builder, 2000, Charset.forName("UTF-8"));
        long elapse = System.currentTimeMillis() - start;
        System.out.println("UTF-8: " + elapse + "(ms)");
        start = System.currentTimeMillis();
        list = splitBytes(builder, 2000, Charset.forName("Shift_JIS"));
        elapse = System.currentTimeMillis() - start;
        System.out.println("Shift_JIS: " + elapse + "(ms)");
    }
}
|
1 | "あいうえおabcdeかきくけこfghij"を10で行揃えして表示
|
文字コードに関する処理を多用するなら、Python 2.xはお勧めできません。ややこしいし、3.xで大幅に変更されるので将来的に無駄知識になるので。
けど、あえてPython 2.xで。
バイト数を調べる方法が分からなかったので、unicodeオブジェクトに一度変換して、unicode文字1文字ずつ、文字コードにエンコードしなおして配列の大きさを見ています。かなり非効率だと思います。
timeコマンドではかったところ、香川県は0.270sでした
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | #!/usr/bin/python
# coding: utf-8
import sys
import codecs
def splitBytes(s, n, coding = 'utf-8'):
    """Split the byte string ``s`` into pieces of at most ``n`` bytes each.

    ``s`` holds text encoded in ``coding``. It is decoded, and every character
    is re-encoded exactly once so that pieces are only cut on character
    boundaries. Returns a list of byte strings in ``coding``.

    The list always starts from one empty piece, so an empty input yields
    ``['']``, and a first character wider than ``n`` leaves that empty piece
    in front (unchanged from the previous behaviour).
    """
    chunks = [b'']
    for c in s.decode(coding):
        encoded = c.encode(coding)
        if len(chunks[-1]) + len(encoded) > n:
            chunks.append(encoded)
        else:
            chunks[-1] += encoded
    return chunks
if __name__ == '__main__':
if len(sys.argv) == 1:
l = splitBytes("あいうえおabcdeかきくけこfghij", 10, "UTF-8")
for x in l: print x
l = splitBytes(u"あいうえおabcdeかきくけこfghij".encode("shift-jis"), 10, "shift-jis")
for x in l: print x.decode("shift-jis")
else:
l = splitBytes(open(sys.argv[1]).read(), 2000, "shift-jis")
for i,x in enumerate(l): print i, ":\n", x.decode("shift-jis"), "\n\n\n"
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 | #!/usr/bin/perl
use strict;
use warnings;
use utf8;
use Benchmark;
use Encode;
# Benchmark: repeatedly open the CSV (decoded from Shift_JIS), slurp it and
# split it at 2000 bytes counted in UTF-8. The file read happens inside the
# timed sub, so the measurement includes I/O and decoding.
timethis(-5, sub {
open my $input, '<:encoding(Shift_JIS)', './37KAGAWA.CSV' or die $!;
splitBytes(do { local $/; <$input> }, '2000', 'UTF-8')
});
# splitBytes($str, $byte_len, $enc_name)
# Encodes $str character by character with the named encoding and returns a
# list of encoded byte strings, each holding at most $byte_len bytes.
sub splitBytes {
    my ($str, $byte_len, $enc_name) = @_;
    my $codec = find_encoding($enc_name);
    my @chunks;
    my ($current, $used) = ('', 0);
    foreach my $ch (split //, $str) {
        my $octets = $codec->encode($ch);
        my $width  = do { use bytes; length $octets };
        if ($used + $width > $byte_len) {
            # Current chunk is full: emit it and start the next one with this character.
            push @chunks, $current;
            ($current, $used) = ($octets, $width);
            next;
        }
        $current .= $octets;
        $used    += $width;
    }
    push @chunks, $current if $used;
    return @chunks;
}
|
Ruby1.8.6です
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | $KCODE='u'
# Splits str into pieces whose summed element sizes stay within split_size.
# encode selects the target encoding: "Shift_JIS" converts the input through
# NKF first, "UTF-8" uses it as is; any other value prints "bad encode" and
# exits the process.
# NOTE(review): presumably targets Ruby 1.8, where $KCODE controls how
# split(//) cuts characters and String#size counts bytes — confirm.
def splitBytes(str, split_size, encode)
require 'nkf'
case encode
when "Shift_JIS"
# Switch the global character mode and convert the input with NKF flags '-Ws'.
$KCODE='s'
str = NKF.nkf('-Ws', str)
when "UTF-8"
else
puts "bad encode"
exit
end
result = []
tmp = ""
total =0
# One array element per character (as decided by the current $KCODE).
ary = str.split(//)
ary.each do |e|
if (e.size + total <= split_size)
tmp << e
total += e.size
else
# Current piece is full: store it and start a new one with this character.
result << tmp
tmp = ""
total = 0
tmp << e
total += e.size
end
end
result << tmp
if ($KCODE == "SJIS")
# Restore the global character mode changed above.
$KCODE = 'u'
# NOTE(review): assigning to the block parameter e does not change the
# elements of result, so this loop leaves the pieces as they are.
result.each do |e|
e = NKF.nkf('-Sw', e)
end
end
result
end
>> splitBytes("あいうえおabcdeかきくけこfghij", 10, "UTF-8")
=> ["あいう", "えおabcd", "eかきく", "けこfghi", "j"]
>> splitBytes("あいうえおabcdeかきくけこfghij", 10, "Shift_JIS")
=> ["\202\240\202\242\202\244\202\246\202\250", "abcde\202\251\202\253", "\202\255\202\257\202\261fghi", "j"]
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 | #!/usr/bin/perl
use strict;
use warnings;
use utf8;
use Benchmark;
use Encode;
# Slurp the CSV as characters (decoded from Shift_JIS) before timing starts.
open my $src_file, '<:encoding(Shift_JIS)', './37KAGAWA.CSV' or die $!;
my $src = do { local $/; <$src_file> };
# Time only the split itself (2000 bytes per piece, counted in Shift_JIS).
my $start = Benchmark->new;
my @results = splitBytes($src, 2000, 'Shift_JIS');
print timestr(timediff(Benchmark->new, $start)), "\n";
# takeBytes($str, $bytes_lim, $enc)
# Returns ($head, $rest): $head is the longest prefix of $str whose encoding
# with the Encode object $enc needs at most $bytes_lim bytes, $rest is the
# remainder. Works by bisecting the string recursively. When even the first
# character does not fit, $head is the empty string.
sub takeBytes {
    my ($str, $bytes_lim, $enc) = @_;
    my $len   = length $str;
    my $bytes = length $enc->encode($str);
    return ($str, '') if $bytes <= $bytes_lim;
    return ('', $str) if $len <= 1;

    my $mid = int($len / 2);
    my $lhs = substr($str, 0, $mid);
    my $rhs = substr($str, $mid);

    my ($head, $rest) = takeBytes($lhs, $bytes_lim, $enc);
    # Compare lengths, not truthiness: a remainder of "0" is a real remainder.
    return ($head, $rest . $rhs) if length $rest;

    # The whole left half fits; see how much budget is left for the right half.
    my $rest_bytes = $bytes_lim - length $enc->encode($lhs);
    return ($head, $rhs) if $rest_bytes == 0;

    ($head, $rest) = takeBytes($rhs, $rest_bytes, $enc);
    return ($lhs . $head, $rest);
}

# splitBytes($str, $bytes_lim, $enc_name)
# Cuts $str into consecutive character strings, each of which encodes to at
# most $bytes_lim bytes in the named encoding. Dies when the encoding is
# unknown or when a single character cannot fit into $bytes_lim.
sub splitBytes {
    my ($str, $bytes_lim, $enc_name) = @_;
    my $enc = find_encoding($enc_name)
        or die "unknown encoding: $enc_name";
    my @results;
    # length, not truthiness: a trailing "0" must not be dropped.
    while (length $str) {
        (my $taken, $str) = takeBytes($str, $bytes_lim, $enc);
        die "a single character does not fit into $bytes_lim bytes"
            unless length $taken;
        push @results, $taken;
    }
    return @results;
}
|
utf8限定 Data.ByteStringを使う
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | module Main where
import qualified Data.ByteString as B
import Data.List
import System.Environment
-- | Reads the byte limit from the first command-line argument, splits all of
-- standard input with 'divid' and writes one piece per line.
main :: IO ()
main = do
  args <- getArgs
  input <- B.getContents
  mapM_ B.putStrLn (divid (read (head args)) input)
-- | Cuts a UTF-8 byte string into pieces by applying 'psi' repeatedly until
-- the input is exhausted.
divid :: Int -> B.ByteString -> [B.ByteString]
divid n = unfoldr step
  where
    step bytes
      | B.null bytes = Nothing
      | otherwise    = Just (psi n bytes)
-- | Splits off at most @n@ bytes from the front. If the byte right after the
-- cut is not the first byte of a UTF-8 sequence, retries with a cut one byte
-- shorter.
psi :: Int -> B.ByteString -> (B.ByteString,B.ByteString)
psi n xs
  | B.null rest           = cut
  | isStart (B.head rest) = cut
  | otherwise             = psi (n-1) xs
  where
    cut@(_, rest) = B.splitAt n xs
    -- A byte starts a sequence unless it lies in 128..191.
    isStart z = z > 191 || z < 128
|
手を抜いて UTF 32 で。戻り値はバイトベクタのリストになります。
1 2 3 4 5 6 7 8 | (import (rnrs))
;; Splits string s into pieces of at most n bytes when encoded as UTF-32
;; (4 bytes per character) and returns them as a list of bytevectors.
;; Raises an assertion violation when a non-empty string is given a limit
;; below 4 bytes: the step would be zero and the loop would never finish.
(define (split-bytes s n)
  (let ((nc (div n 4))
        (l (string-length s)))
    (when (and (< nc 1) (> l 0))
      (assertion-violation 'split-bytes
                           "byte limit must be at least 4 (one UTF-32 character)"
                           n))
    (do ((i 0 (+ i nc))
         (rs '() (cons (substring s i (min (+ i nc) l)) rs)))
        ((>= i l) (map string->utf32 (reverse rs))))))
|
なんか汚いですが。
香川県:10ms 全県:1.3秒 Pentium M 1.7GHz です。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | open System;
open System.Text;
open System.IO;
(* Splits s into consecutive substrings whose size, measured with enc, does
   not exceed len bytes. Characters are measured one at a time. An empty
   input yields a list holding one empty string. Fails when a single
   character needs more than len bytes; without that check the outer loop
   would never advance. *)
let splitBytes (s:string) (len:int) (enc:Encoding) =
  let chars = s.ToCharArray() in
  (* Index one past the last character that still fits when starting at from. *)
  let next from =
    let rec loop i bytes =
      if i = s.Length then i
      else
        let bytes = bytes + enc.GetByteCount(chars, i, 1) in
        if bytes > len then i
        else loop (i + 1) bytes
    in loop from 0
  in
  let rec loop i chunks =
    let ii = next i in
    if ii = i && i < s.Length then
      failwith "splitBytes: a single character does not fit into the byte limit"
    else
      let chunks = s.Substring(i, ii - i) :: chunks in
      if ii = s.Length then List.rev chunks
      else loop ii chunks
  in loop 0 []
in
(* Driver: when an argument is given, it names the encoding. All of standard
   input is read and split into 2000-byte pieces, the pieces are written to
   standard output and the elapsed milliseconds to standard error. The timer
   starts before the input is read, so it covers reading and printing too. *)
if Sys.argv.Length > 1 then
let start = Environment.TickCount in
let enc = Encoding.GetEncoding(Sys.argv.[1]) in
let data = Console.In.ReadToEnd() in
let tokens = splitBytes data 2000 enc in
List.iter (fun (x:string) -> Console.Out.WriteLine(x)) tokens;
Console.Error.WriteLine("{0} millis", Environment.TickCount - start)
|
どうも新しいバージョンのF#だとOCaml互換のSysモジュールが使えない?ようなので、ちょっと修正しました。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 | #light "off"
open System;
open System.Text;
open System.IO;
(* Splits s into consecutive substrings whose size, measured with enc, does
   not exceed len bytes. Characters are measured one at a time. An empty
   input yields a list holding one empty string. Fails when a single
   character needs more than len bytes; without that check the outer loop
   would never advance. *)
let splitBytes (s:string) (len:int) (enc:Encoding) =
  let chars = s.ToCharArray() in
  (* Index one past the last character that still fits when starting at from. *)
  let next from =
    let rec loop i bytes =
      if i = s.Length then i
      else
        let bytes = bytes + enc.GetByteCount(chars, i, 1) in
        if bytes > len then i
        else loop (i + 1) bytes
    in loop from 0
  in
  let rec loop i chunks =
    let ii = next i in
    if ii = i && i < s.Length then
      failwith "splitBytes: a single character does not fit into the byte limit"
    else
      let chunks = s.Substring(i, ii - i) :: chunks in
      if ii = s.Length then List.rev chunks
      else loop ii chunks
  in loop 0 []
in
(* Driver: argv.[1] names the encoding. All of standard input is read and
   split into 2000-byte pieces, the pieces are written to standard output
   and the elapsed milliseconds to standard error. The timer starts before
   the input is read, so it covers reading and printing too. *)
let argv = Environment.GetCommandLineArgs() in
if argv.Length > 1 then
let start = Environment.TickCount in
let enc = Encoding.GetEncoding(argv.[1]) in
let data = Console.In.ReadToEnd() in
let tokens = splitBytes data 2000 enc in
List.iter (fun (x:string) -> Console.Out.WriteLine(x)) tokens;
Console.Error.WriteLine("{0} millis", Environment.TickCount - start)
|
Pentium M 1.30GHzでは以下の結果になりました。
UTF-8: 1082(ms) Shift_JIS: 2784(ms)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 | import java.util.*;
import java.io.*;
class Kadai {
public static String[] splitBytes(String s, int l, String e) throws Exception {
List b = new ArrayList();
StringBuffer t = new StringBuffer();
for (int i = 0; i < s.length(); ++i) {
String u = s.substring(i,i+1);
if (t.toString().getBytes(e).length + u.getBytes(e).length > l) {
b.add(t.toString());
t.delete(0,t.length());
}
t.append(u);
}
b.add(t.toString());
return (String[])b.toArray(new String[0]);
}
public static void main(String[] args) throws Exception {
//System.out.println(splitBytes("あいうえおabcdeかきくけこfghij", 10, Charset.forName("Shift_JIS")));
//System.out.println(splitBytes("あいうえおabcdeかきくけこfghij", 10, Charset.forName("UTF-8")));
StringBuilder buf = new StringBuilder();
BufferedReader reader = null;
try {
reader = new BufferedReader(new InputStreamReader(new FileInputStream(".\\37KAGAWA.CSV"), "Shift_JIS"));
String line;
while ( ( line = reader.readLine() ) != null ) {
buf.append( line );
}
} finally {
if ( reader != null ) {
reader.close();
}
}
String str = buf.toString();
long start;
long elapse;
start = System.currentTimeMillis();
splitBytes(str, 2000, "UTF-8");
elapse = System.currentTimeMillis() - start;
System.out.println("UTF-8: " + elapse + "(ms)");
start = System.currentTimeMillis();
splitBytes(str, 2000, "Shift_JIS");
elapse = System.currentTimeMillis() - start;
System.out.println("Shift_JIS: " + elapse + "(ms)");
}
}
|
以下のコードで動作確認しました。
// テスト assert splitBytes("あいうえおabcdeかきくけこfghij", 10, "SJIS") == ["あいうえお", "abcdeかき", "くけこfghi", "j"] assert splitBytes("あいうえおabcdeかきくけこfghij", 10, "UTF-8") == ["あいう", "えおabcd", "eかきく", "けこfghi", "j"]
1 2 3 4 5 6 7 8 9 | def splitBytes( text, size, charset ){
def list = []
text.each{
if( list.empty || (list[-1] + it).getBytes(charset).size() > size )
list << ""
list[-1] += it
}
list
}
|
とりあえずお題の消化だけ。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | <?php
//ini_set('mbstring.internal_encoding', 'Shift_JIS');
/**
 * Splits a string into pieces of at most $length bytes when encoded in $encode.
 *
 * The input is taken to be in the current mbstring internal encoding, is
 * converted to $encode for measuring and cutting, and every piece is
 * converted back before it is returned. The encodings are passed explicitly
 * to each mbstring call, so the global internal encoding is never modified.
 * Empty pieces are not emitted, except that an empty input yields array("").
 *
 * @param string $string text to split (internal encoding)
 * @param int    $length maximum size of one piece in bytes
 * @param string $encode encoding in which the bytes are counted
 * @return array pieces in input order (internal encoding)
 */
function splitBytes($string, $length, $encode)
{
    $local_encoding = mb_internal_encoding();
    $string = mb_convert_encoding($string, $encode, $local_encoding);
    $out = array();
    $chunk = "";
    // Hoisted: mb_strlen walks the whole string on every call.
    $count = mb_strlen($string, $encode);
    for($i = 0; $i < $count; $i++)
    {
        $char = mb_substr($string, $i, 1, $encode);
        if($chunk !== "" && strlen($chunk) + strlen($char) > $length)
        {
            $out[] = $chunk;
            $chunk = "";
        }
        $chunk .= $char;
    }
    $out[] = $chunk;
    foreach($out as $key => $val)
    {
        $out[$key] = mb_convert_encoding($val, $local_encoding, $encode);
    }
    return $out;
}
$out = splitBytes("あいうえおabcdeかきくけこfghij", 10, "Shift_JIS");
var_dump($out);
$out = splitBytes("あいうえおabcdeかきくけこfghij", 10, "UTF-8");
var_dump($out);
|
JavaScriptでやることもないと思うけども、JavaScriptで。Pen4 2.5GHz + Firefox 3で香川県をやると、Shift_JISで110ms、UTF-8で150ms前後でした。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 | function splitBytes(src, len, encode){
var re = {
"Shift_JIS": [/[\u0000-\u007F\uFF61-\uFF9F]/g, /[\u0080-\uFF60\uFFA0-\uFFFF]/g],
"UTF-8": [/[\u0000-\u007F]/g, /[\u0080-\u07FF]/g, /[\u0800-\uFFFF]/g]
}[encode];
if (!(typeof src == "string") || !(typeof len == "number" && len > 0) || !re) return false;
var enc = src;
for (var i = 0; i < re.length; i++){
enc = enc.replace(re[i], Math.pow(10, i));
}
var startSrc = 0, startEnc = 0, tmp, splitLen, rslt = [];
while (startSrc != src.length){
tmp = enc.substr(startEnc, len + 1);
if (tmp.length > len) tmp = tmp.replace(/10*$/, "");
startEnc += tmp.length;
splitLen = tmp.replace(/0+/g, "").length;
rslt.push(src.substr(startSrc, splitLen));
startSrc += splitLen;
}
return rslt;
}
|
新しいリリースに追従しました。
http://clojure.org/lazy
とっても自由な感じですね!これを参考に、lazy-cons に変えて、 seq + lazy-seq + cons にしました。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | (defn splitBytes [string len encode]
(let [count-bytes (fn [cs] (-> (str cs) (.getBytes encode) count))
len1 (fn [cs]
(let [n (count cs)]
(loop [i 0, b 0, cs cs]
(let [blen (count-bytes (first cs))]
(if (and (< i n)
(<= (+ b blen) len))
(recur (inc i) (+ b blen) (rest cs))
i)))))
step (fn step [cs]
(lazy-seq
(when-let [s (seq cs)]
(let [[h t] (split-at (len1 s) s)]
(when h
(cons h (step t)))))))]
(step string)))
|
Turbo Delphiです。 全国一括で Shift-Jis : 90ms程度 utf-8 : 105ms程度 (Pen4 3.40GHz)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | program Doukaku234;
{$APPTYPE CONSOLE}
uses
Types, SysUtils, Classes, MMSystem;
type
TEncode = (eAnsi{=ShiftJis}, eUtf8);
// Byte length of the ANSI character starting at AChar: 2 when its first
// byte is a member of LeadBytes, otherwise 1.
function AnsiCharLen(AChar: PChar): Integer; inline;
begin
  Result := 1;
  if AChar^ in LeadBytes then
    Inc(Result);
end;
// Byte length of the UTF-8 sequence starting at AChar, taken from the number
// of leading 1 bits of the first byte. Bytes without a leading 1 bit
// ($00..$7F) and bytes with exactly one ($80..$BF) both give 1.
function Utf8CharLen(AChar: PChar): Integer; inline;
begin
  case Byte(AChar^) of
    $C0..$DF: Result := 2;
    $E0..$EF: Result := 3;
    $F0..$F7: Result := 4;
    $F8..$FB: Result := 5;
    $FC..$FD: Result := 6;
    $FE:      Result := 7;
    $FF:      Result := 8;
  else
    Result := 1;
  end;
end;
// Splits AString into consecutive pieces of at most ALength bytes without
// cutting a character in half. AEncode selects how character lengths are
// determined (ANSI lead bytes or UTF-8 lead bits). Scanning stops at the
// first #0 or at the end of the string, whichever comes first.
function SplitBytes(const AString: string; const ALength: Integer; const AEncode: TEncode): TStringDynArray;
var
  Count: Integer;
  P, Q, PEnd: PChar;
  Len: Integer;
begin
  Count := 0;
  P := PChar(AString);
  PEnd := P + Length(AString);
  Q := P;
  while (PEnd - P > 0) and (P^ <> #0) do
  begin
    case AEncode of
      eAnsi: Len := AnsiCharLen(P);
      eUtf8: Len := Utf8CharLen(P);
    else
      Len := 1; // keep Len defined for any other enum value
    end;
    // A truncated trailing sequence must not move P beyond the terminator.
    if Len > PEnd - P then
      Len := PEnd - P;
    if P - Q + Len > ALength then
    begin
      SetLength(Result, Count + 1);
      SetString(Result[Count], Q, P - Q);
      Inc(Count);
      Q := P;
    end;
    Inc(P, Len);
  end;
  // Whatever is left after the last cut becomes the final piece.
  SetLength(Result, Count + 1);
  Result[Count] := Q;
end;
const
TestFilePath = '.\KEN_ALL.CSV';
FileSplitLen = 2000;
var
List: TStringDynArray;
AnsiText: AnsiString;
Utf8Text: UTF8String;
I: Integer;
T1, T2: DWORD;
begin
// Load the whole test file into AnsiText as raw bytes.
with TFileStream.Create(TestFilePath, fmOpenRead) do
try
SetLength(AnsiText, Size);
Read(AnsiText[1], Size);
finally
Free;
end;
// Prepare the UTF-8 variant before timing, so conversion is not measured.
Utf8Text := AnsiToUtf8(AnsiText);
Writeln('Ansi - SplitBytes(', TestFilePath, ') ->');
// Only the SplitBytes call sits between the two timeGetTime readings.
T1 := timeGetTime;
List := SplitBytes(AnsiText, FileSplitLen, eAnsi);
T2 := timeGetTime;
for I := 0 to Length(List) - 1 do
Writeln('>>', List[I], '<<');
Writeln('<- ', T2 - T1,'msec');
Writeln('Utf8 - SplitBytes(', TestFilePath, ') ->');
T1 := timeGetTime;
List := SplitBytes(Utf8Text, FileSplitLen, eUtf8);
T2 := timeGetTime;
// Pieces are converted back with Utf8ToAnsi for display.
for I := 0 to Length(List) - 1 do
Writeln('>>', Utf8ToAnsi(List[I]), '<<');
Writeln('<- ', T2 - T1,'msec');
end.
|
Scalaがまだの様なので。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | class Fold {
var w:Int = 0
var e:String = null
def this(e:String) = { this(); this.e = e }
def this(w:Int, e:String) = { this(e); this.w = w }
/** Folds `s` into pieces whose size in encoding `e` stays within `w` bytes.
  * Pieces are collected newest-first and reversed at the end; the
  * accumulator starts with one empty piece.
  */
def fold(s:String):List[String] = {
  var acc = List("")
  for (ch <- s) {
    val candidate = acc.head + ch
    acc =
      if (candidate.getBytes(e).size > w) ch.toString :: acc
      else candidate :: acc.tail
  }
  acc.reverse
}
}
/** Companion with a one-shot helper: builds a Fold for width `w` and
  * encoding `e` and applies it to `s`.
  */
object Fold {
  def fold(s:String, w:Int, e:String):List[String] = {
    val folder = new Fold(w, e)
    folder.fold(s)
  }
}
/** Command-line entry point: `SplitBytes BYTES ENCODING` folds every line
  * read from standard input and prints the resulting list.
  */
object SplitBytes {
  /** Reads lines until readLine yields null, printing each folded line. */
  def foldLines(f:Fold):Unit = {
    var line = readLine
    while (line != null) {
      println(f.fold(line).toString)
      line = readLine
    }
  }
  def main(args:Array[String]):Unit = {
    if (args.length == 2) {
      try {
        foldLines(new Fold(args(0).toInt, args(1)))
      } catch {
        case e:NumberFormatException => println("バイト数の指定が不正です")
        case e => e.printStackTrace
      }
    } else {
      println("usage: SplitBytes BYTES ENCODING")
    }
  }
}
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | #include <iostream>
#include <vector>
#include <string>
#include <iterator>
#include <algorithm>
#include <cstdlib>
#if 1
// Fallback used when the `#if 1` branch is active (the `#else` branch imports
// std::move instead): hands back a copy of `s` and leaves `s` untouched.
inline std::string move(const std::string& s)
{
    std::string duplicate(s);
    return duplicate;
}
#else
using std::move;
#endif
// Given the first byte of a character, returns how many bytes the character
// occupies: 2 for bytes in 0x81-0x9F or 0xE0-0xFC, otherwise 1.
unsigned get_bytes(char c)
{
    const unsigned char b = static_cast<unsigned char>(c);
    const bool two_byte_lead = (b >= 0x81 && b <= 0x9F) || (b >= 0xE0 && b <= 0xFC);
    return two_byte_lead ? 2 : 1;
}

// Reads bytes from [first, last) and appends them to `buf` as chunks of at
// most `splitBytes` bytes, never separating the two bytes of a double-byte
// character.
//
// - A character wider than `splitBytes` is stored as a chunk of its own.
// - A lead byte at the very end of the input (missing trail byte) is stored
//   as is; the end iterator is never dereferenced or advanced past.
// - The final (possibly empty) chunk is always appended, so an empty input
//   adds one empty string.
template<typename Iterator>
void push_data(unsigned long splitBytes, std::vector<std::string>& buf, Iterator first, Iterator last)
{
    std::string chunk;
    chunk.reserve(splitBytes);
    while (first != last)
    {
        const char lead = *first;
        ++first;
        const unsigned bytes = get_bytes(lead);
        // Compare sums rather than `splitBytes - size`, which wraps around
        // once a chunk has grown beyond the limit.
        if (!chunk.empty() && chunk.size() + bytes > splitBytes)
        {
            buf.push_back(chunk);
            chunk.clear();   // keeps the reserved capacity
        }
        chunk += lead;
        for (unsigned i = 1; i < bytes && first != last; ++i, ++first)
        {
            chunk += *first;
        }
    }
    buf.push_back(chunk);
}
// Usage: <program> BYTES < input
// Splits standard input into chunks of at most BYTES bytes and writes one
// chunk per line. Exits with status 1 when BYTES is missing, not a number,
// or zero.
int main(int argc, char** argv)
{
    if (argc <= 1)
    {
        return 1;
    }
    // Keep the full unsigned long range instead of narrowing through int.
    const unsigned long splitBytes = std::strtoul(argv[1], 0, 10);
    if (splitBytes == 0)
    {
        return 1;
    }
    std::vector<std::string> buf;
    push_data(splitBytes, buf, std::istreambuf_iterator<char>(std::cin), std::istreambuf_iterator<char>());
    std::copy(buf.begin(), buf.end(), std::ostream_iterator<std::string>(std::cout, "\n"));
    return 0;
}
|





fumokmm #8443() [ Groovy ] Rating0/0=0.00
与えられた文字列を指定されたバイト数以下に分割する関数 splitBytes を書いてください。エンコーディングは実装側の 自由としますが、日本語対応は必須とします。 また、分割した結果の表現方法は、各言語で都合のよいものを 選択して下さい。(リスト,配列,改行区切りなど) [使用例] "あいうえおabcdeかきくけこfghij" -> 30バイト (Shift_JIS) "あいうえおabcdeかきくけこfghij" -> 40バイト (UTF-8) ★Shift_JISで10バイトで分割 splitBytes("あいうえおabcdeかきくけこfghij", 10, "Shift_JIS") => ["あいうえお", "abcdeかき", "くけこfghi", "j"] (10バイト, 9バイト, 10バイト, 1バイト) ★UTF-8で10バイトで分割 splitBytes("あいうえおabcdeかきくけこfghij", 10, "UTF-8") => ["あいう", "えおabcd", "eかきく", "けこfghi", "j"] (9バイト, 10バイト, 10バイト, 1バイト) ---補足------------------------------------------- 当処理は文字列をデータベースの 固定長フィールド(2000バイト)に投入できる最大サイズ ぎりぎりまでに分割する時の利用を考えています。 -------------------------------------------------- ●●● 余力のある方は以下も行ってみて下さい。 (1) 以下データを2000バイトずつに分割し、処理時間を計測する。 日本郵便 - 香川県の郵便番号(CSV形式:10,513Byte) http://www.post.japanpost.jp/zipcode/dl/kogaki/lzh/37kagawa.lzh ※ひとまず香川県が一番サイズが小さいので選びました。 自信ありの方はさらに大きいサイズのファイルでもチャレンジして みて下さい。(全国一括など) http://www.post.japanpost.jp/zipcode/dl/kogaki.html (2) 処理速度向上・コード短縮// サンプルコード (Groovy版) def splitBytes(str, len, encode) { result = [] while (str) { int stock = 0; for (i in 0..<str.size()){ if (str[0..i].getBytes(encode).size() <= len) stock = i else break } result << str[0..stock] str -= str[0..stock] } return result } println splitBytes("あいうえおabcdeかきくけこfghij", 10, "Shift_JIS") println splitBytes("あいうえおabcdeかきくけこfghij", 10, "UTF-8")Rating0/0=0.00-0+
[ reply ]