matarillo #3530(2007/10/26 10:32 GMT) [ C# ] Rating0/0=0.00
わりとまじめに書いたので長くなりました。 ファイルアクセスがすこしでも少なくなるようにSeekすることにしましたが、どんなもんでしょうか。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
using System; using System.Collections.Generic; using System.Text; using System.IO; class Program { static Encoding enc = Encoding.Default; static void Main(string[] args) { IDictionary<int, List<long>> hashtable = new Dictionary<int, List<long>>(); SortedList<long, List<long>> collision = new SortedList<long, List<long>>(); using (FileStream fs = new FileStream(args[0], FileMode.Open, FileAccess.Read)) { using (BufferedStream bs = new BufferedStream(fs)) { // 重複候補の位置をリストに保存 long currentPosition = 0L; byte[] bytes = null; while ((bytes = ReadLine(bs)).Length > 0) { string line = enc.GetString(bytes); int hash = line.GetHashCode(); List<long> list = null; if (!hashtable.ContainsKey(hash)) { list = new List<long>(); hashtable.Add(hash, list); } else { list = hashtable[hash]; if (list.Count == 1) collision.Add(list[0], list); collision.Add(currentPosition, list); } list.Add(currentPosition); currentPosition += bytes.Length; } // 重複候補の値を調べ、末尾側に重複があるか検証 currentPosition = 0L; bs.Position = currentPosition; for (int baseIndex = 0; baseIndex < collision.Count; baseIndex++) { long basePosition = collision.Keys[baseIndex]; bs.Seek(basePosition - currentPosition, SeekOrigin.Current); currentPosition = basePosition; byte[] baseBytes = ReadLine(bs); currentPosition += baseBytes.Length; string baseString = enc.GetString(baseBytes); List<long> targetList = collision.Values[baseIndex]; bool found = false; for (int targetIndex = 0; targetIndex < targetList.Count; targetIndex++) { long targetPosition = targetList[targetIndex]; if (targetPosition <= basePosition) continue; bs.Seek(targetPosition - currentPosition, SeekOrigin.Current); currentPosition = targetPosition; byte[] targetBytes = ReadLine(bs); currentPosition += targetBytes.Length; string targetString = enc.GetString(targetBytes); found = baseString.Equals(targetString); if (found) break; } if (!found) collision.RemoveAt(baseIndex--); } // 出力 currentPosition = 0L; bs.Position = currentPosition; while ((bytes = ReadLine(bs)).Length > 0) { if (!collision.ContainsKey(currentPosition)) Console.Write(enc.GetString(bytes)); currentPosition += bytes.Length; } } } } static byte[] ReadLine(Stream s) { byte[] bytes = enc.GetBytes(Environment.NewLine); List<byte> line = new List<byte>(); int b = 0; while ((b = s.ReadByte()) >= 0) { line.Add((byte)b); if (line.Count >= bytes.Length) { bool newline = false; for (int i = 0; i < bytes.Length; i++) { newline = (line[line.Count - bytes.Length + i] == bytes[i]); if (!newline) break; } if (newline) return line.ToArray(); } } return line.ToArray(); } }
Rating0/0=0.00-0+
[ reply ]
matarillo
#3530()
[
C#
]
Rating0/0=0.00
Rating0/0=0.00-0+
[ reply ]