MODULE UnihanParser;
IMPORT
Streams, Files, Strings, UTF8Strings, TextUtilities, KernelLog;
CONST
MaxCode = 200000;
TYPE
String = Strings.String;
Character* = OBJECT
VAR
radical*, tradvariant*, simpvariant*,
additionalstrokes*, totalstrokes*, freq*, xfreq* : LONGINT;
mandarin*, pinyin*, cantonese*, korean*, definition* : String;
END Character;
TYPE
ListElement = POINTER TO RECORD
next : ListElement;
END;
CompareProc = PROCEDURE {DELEGATE} (a, b : ListElement) : LONGINT;
TYPE
PIMEEntry = POINTER TO RECORD(ListElement)
pinyin : ARRAY 8 OF CHAR;
ucs : LONGINT;
freq : LONGINT;
END;
VAR
codeArray : POINTER TO ARRAY OF Character;
nofLines : LONGINT;
tok : ARRAY 32 OF CHAR;
data : ARRAY 256 OF CHAR;
cap : ARRAY 256 OF CHAR;
isNum : ARRAY 256 OF BOOLEAN;
PROCEDURE GetHex(r : Streams.Reader) : LONGINT;
VAR c : CHAR; res : LONGINT;
BEGIN
res := 0;
c := r.Get();
WHILE (c >= "0") & (c <= "9") OR (CAP(c) >= "A") & (CAP(c) <= "F") DO
IF (c >= "0") & (c <= "9") THEN res := res * 16 + (ORD(c)-ORD("0"))
ELSE res := res * 16 + (ORD(CAP(c))-ORD("A") + 10)
END;
c := r.Get()
END;
RETURN res
END GetHex;
PROCEDURE HasCode*(code : LONGINT) : BOOLEAN;
BEGIN
RETURN (code >= 0) & (code < MaxCode) & (codeArray[code] # NIL)
END HasCode;
PROCEDURE GetCharacter*(code : LONGINT) : Character;
BEGIN
ASSERT( (code >= 0) & (code < MaxCode) );
IF (codeArray[code] = NIL) THEN NEW(codeArray[code]) END;
RETURN codeArray[code]
END GetCharacter;
PROCEDURE CountCharacters() : LONGINT;
VAR i, count : LONGINT;
BEGIN
count := 0;
FOR i := 0 TO MaxCode - 1 DO IF codeArray[i] # NIL THEN INC(count) END END;
RETURN count
END CountCharacters;
PROCEDURE ParseLine(r: Streams.Reader);
VAR code : LONGINT; c : CHAR; char : Character;
BEGIN
c := r.Get();
IF c = "U" THEN
r.SkipBytes(1);
code := GetHex(r);
char := GetCharacter(code);
r.SkipWhitespace;
r.Token(tok);
r.SkipWhitespace;
IF tok = "kDefinition" THEN
r.Ln(data);
char.definition := Strings.NewString(data)
ELSIF tok = "kMandarin" THEN
r.Ln(data);
char.mandarin := Strings.NewString(data)
ELSIF tok = "kFrequency" THEN
ELSIF tok = "kCantonese" THEN
r.Ln(data);
char.cantonese := Strings.NewString(data)
ELSIF tok = "kKorean" THEN
r.Ln(data);
char.korean := Strings.NewString(data)
ELSIF tok = "kSimplifiedVariant" THEN
r.Ln(data);
ELSIF tok = "kTraditionalVariant" THEN
r.Ln(data);
ELSE r.SkipLn;
END
ELSE r.SkipLn;
END;
INC(nofLines);
IF nofLines MOD 50000 = 0 THEN KernelLog.Int(nofLines, 5); KernelLog.String(" lines and counting..."); KernelLog.Ln END;
END ParseLine;
PROCEDURE AddRadicals;
VAR f : Files.File; r : Files.Reader;
code, count : LONGINT; char : Character;
BEGIN
f := Files.Old("Radicals.txt");
IF f # NIL THEN
KernelLog.String("Adding radical info..."); KernelLog.Ln;
Files.OpenReader(r, f, 0);
count := 0;
WHILE r.res = 0 DO
IF ~TextUtilities.GetUTF8Char(r, code) THEN HALT(1234) END; r.SkipWhitespace;
char := GetCharacter(code);
IF ~TextUtilities.GetUTF8Char(r, char.radical) THEN HALT(1234) END; r.SkipWhitespace;
r.Ln(data);
char.pinyin := Strings.NewString(data);
INC(count)
END;
KernelLog.String("Radical info to "); KernelLog.Int(count, 4); KernelLog.String(" characters added."); KernelLog.Ln
ELSE
KernelLog.String("Could not find radicals.txt"); KernelLog.Ln
END
END AddRadicals;
PROCEDURE AddFrequencies;
VAR f : Files.File; r : Files.Reader;
code, count, i : LONGINT; char : Character;
BEGIN
f := Files.Old("frequencies.txt");
IF f # NIL THEN
KernelLog.String("Adding frequency info..."); KernelLog.Ln;
count := 0;
Files.OpenReader(r, f, 0);
WHILE r.res = 0 DO
r.SkipLn; INC(count)
END;
Files.OpenReader(r, f, 0);
i := 0;
WHILE r.res = 0 DO
IF ~TextUtilities.GetUTF8Char(r, code) THEN HALT(1234) END; r.SkipWhitespace;
char := GetCharacter(code);
char.xfreq := ENTIER(99 * i / count) + 1;
r.SkipLn;
INC(i)
END;
FOR i := 0 TO MaxCode - 1 DO
IF codeArray[i] # NIL THEN
IF (codeArray[i].xfreq = 0) & (codeArray[i].freq # 0) THEN
codeArray[i].xfreq := (codeArray[i].freq - 1) * 20
END
END
END;
KernelLog.String("Frequency info added to "); KernelLog.Int(count, 4); KernelLog.String(" characters."); KernelLog.Ln
ELSE
KernelLog.String("Could not find frequencies.txt"); KernelLog.Ln
END
END AddFrequencies;
PROCEDURE ParseFile*;
VAR f : Files.File; r : Files.Reader;
BEGIN
f := Files.Old("Unihan.txt");
IF f # NIL THEN
Files.OpenReader(r, f, 0);
WHILE r.res = 0 DO ParseLine(r) END;
KernelLog.Int(nofLines, 5); KernelLog.String(" lines processed."); KernelLog.Ln;
KernelLog.Int(CountCharacters(), 5); KernelLog.String(" unique characters"); KernelLog.Ln;
AddRadicals;
AddFrequencies
ELSE KernelLog.String("Unihan.txt not found"); KernelLog.Ln
END;
END ParseFile;
PROCEDURE Sort(root: ListElement; compare : CompareProc);
VAR m, n, np, nq: LONGINT; p, q, tail: ListElement;
BEGIN
n := 1;
REPEAT
p := root.next; q := p; tail := root; m := 0;
WHILE p # NIL DO
np := 0;
REPEAT q := q.next; INC(np) UNTIL (q = NIL) OR (np = n);
nq := n; INC(m);
LOOP
IF (np # 0) & ((nq = 0) OR (q = NIL) OR (compare(p, q) <= 0)) THEN
tail.next := p; tail := p; p := p.next; DEC(np)
ELSIF (nq # 0) & (q # NIL) THEN
tail.next := q; tail := q; q := q.next; DEC(nq)
ELSE
EXIT
END
END;
tail.next := NIL; p := q
END;
n := n*2
UNTIL m <= 1
END Sort;
PROCEDURE ComparePinyinFreq(a, b: ListElement) : LONGINT;
BEGIN
IF a(PIMEEntry).pinyin < b(PIMEEntry).pinyin THEN RETURN -1 END;
IF a(PIMEEntry).pinyin > b(PIMEEntry).pinyin THEN RETURN 1 END;
IF a(PIMEEntry).freq > b(PIMEEntry).freq THEN RETURN -1 END;
IF a(PIMEEntry).freq < b(PIMEEntry).freq THEN RETURN 1 END;
RETURN 0
END ComparePinyinFreq;
PROCEDURE BuildBasicPinyinIMETable*;
VAR i, words : LONGINT;
list : ListElement;
cur : ListElement;
str : ARRAY 16 OF CHAR;
f : Files.File;
w : Files.Writer;
PROCEDURE AddEntry(pinyin : ARRAY OF CHAR; code, freq : LONGINT);
VAR e : PIMEEntry;
BEGIN
NEW(e);
INC(words);
COPY(pinyin, e.pinyin); e.ucs := code; e.freq := freq; e.next := list.next;
list.next := e
END AddEntry;
PROCEDURE FixBugs(pinyin : ARRAY OF CHAR; code, freq : LONGINT);
VAR i, l : LONGINT; t : ARRAY 100 OF CHAR;
BEGIN
Strings.Trim(pinyin, " ");
l := Strings.Length(pinyin);
i := 1; WHILE i < l DO
IF (isNum[ORD(pinyin[i])]) & (pinyin[i + 1] # 0X) THEN
Strings.Copy(pinyin, 0, i + 1, t);
AddEntry(t, code, freq);
Strings.Delete(pinyin, 0, i + 1);
i := 1; l := Strings.Length(pinyin);
ELSE INC(i)
END
END;
IF pinyin # "" THEN
AddEntry(pinyin, code, freq)
END
END FixBugs;
PROCEDURE DumpChar(i : LONGINT; c : Character);
VAR p, t : ARRAY 256 OF CHAR;
BEGIN
IF c.mandarin = NIL THEN
KernelLog.String("unexpected : no mandarininfo "); KernelLog.String("i = "); KernelLog.Int(i, 0); KernelLog.Ln;
RETURN
END;
COPY(c.mandarin^, p);
Strings.Trim(p, " ");
WHILE Strings.Length(p) > 1 DO
IF Strings.Pos(" ", p) >= 0 THEN
Strings.Copy(p, 0, Strings.Pos(" ", p), t);
FixBugs(t, i, c.xfreq);
Strings.Delete(p, 0, Strings.Pos(" ", p));
Strings.Trim(p, " ");
ELSE
FixBugs(p, i, c.xfreq);
p := ""
END
END
END DumpChar;
BEGIN
NEW(list);
words := 0;
KernelLog.String("creating list...");
FOR i := 0 TO MaxCode - 1 DO
IF codeArray[i] # NIL THEN
IF codeArray[i].xfreq > 0 THEN
DumpChar(i, codeArray[i])
END
END
END;
KernelLog.String("done."); KernelLog.Ln;
KernelLog.String("sorting");
Sort(list, ComparePinyinFreq);
KernelLog.String("done."); KernelLog.Ln;
KernelLog.String("Writing PinyinIMETable.txt");
f := Files.New("PinyinIMETable.txt");
Files.OpenWriter(w, f, 0);
w.Int(words, 0); w.Ln;
cur := list.next;
WHILE cur # NIL DO
w.String(cur(PIMEEntry).pinyin); w.Char(09X);
i := 0; IF UTF8Strings.EncodeChar(cur(PIMEEntry).ucs, str, i) THEN END;
w.String(str); w.Char(09X);
w.Int(cur(PIMEEntry).freq, 0);
w.Ln;
INC(words);
cur := cur.next
END;
w.Update;
Files.Register(f);
KernelLog.String("done."); KernelLog.Ln;
END BuildBasicPinyinIMETable;
PROCEDURE InitLookup;
VAR i : LONGINT;
BEGIN
FOR i := 0 TO 255 DO cap[i] := CHR(i) END;
FOR i := ORD("a") TO ORD("z") DO cap[i] := CHR(i - 32) END;
FOR i := 0 TO 255 DO isNum[i] := FALSE END;
FOR i := ORD("0") TO ORD("9") DO isNum[i] := TRUE END;
END InitLookup;
BEGIN
InitLookup;
NEW(codeArray, MaxCode);
ParseFile;
END UnihanParser.
SystemTools.Free UnihanParser ~
UnihanParser.ParseFile ~
PET.Open PinyinIMETable.txt
UnihanParser.BuildBasicPinyinIMETable ;