MODULE UTF8Strings;
CONST
CmpLess* = -1; CmpEqual* = 0; CmpGreater* = 1; CmpError* = 2;
VAR
CodeLength-: ARRAY 256 OF CHAR;
init: LONGINT;
PROCEDURE EncodeChar*(ucs: LONGINT; VAR str: ARRAY OF CHAR; VAR i: LONGINT): BOOLEAN;
VAR len, j: LONGINT; byte, mask, max: INTEGER;
buf: ARRAY 6 OF CHAR;
BEGIN
len := LEN(str);
IF (ucs <= 7FH) THEN
IF (i + 1 < len) THEN str[i] := CHR(SHORT(ucs));
str[i+1] := 0X;
INC(i)
ELSE RETURN FALSE
END
ELSE
byte := 0; mask := 7F80H; max := 3FH;
WHILE (ucs > max) DO
buf[byte] := CHR(80H + SHORT(ucs MOD 40H)); INC(byte);
ucs := ucs DIV 64;
mask := mask DIV 2;
max := max DIV 2;
END;
buf[byte] := CHR(mask + SHORT(ucs));
IF (i + byte + 1 < len) THEN
FOR j := 0 TO byte DO str[i + j] := buf[byte - j] END;
str[i+byte+1] := 0X;
i := i + byte + 1
ELSE RETURN FALSE
END
END;
RETURN TRUE
END EncodeChar;
PROCEDURE DecodeChar*(CONST str: ARRAY OF CHAR; VAR i, ucs: LONGINT): BOOLEAN;
VAR len, ch, min: LONGINT;
BEGIN
IF i < LEN(str) THEN
ch := LONG(ORD(str[i]));
IF ch < 80H THEN
ucs := ch; INC(i);
RETURN TRUE
ELSE
CASE CodeLength[ch] OF
2X: ucs := ch MOD 20H; len := 2; min := 80H
|3X: ucs := ch MOD 10H; len := 3; min := 800H
|4X: ucs := ch MOD 8; len := 4; min := 10000H
|5X: ucs := ch MOD 4; len := 5; min := 200000H
|6X: ucs := ch MOD 2; len := 6; min := 4000000H
ELSE RETURN FALSE
END;
LOOP
INC(i); DEC(len);
IF len = 0 THEN RETURN ucs >= min END;
IF i = LEN(str) THEN EXIT END;
ch := LONG(ORD(str[i]));
IF ASH(ch, -6) # 2 THEN EXIT END;
ucs := ASH(ucs, 6) + ch MOD 40H
END
END
END;
RETURN FALSE
END DecodeChar;
PROCEDURE Valid*(CONST str: ARRAY OF CHAR): BOOLEAN;
VAR i, ucs: LONGINT;
BEGIN
i := 0;
WHILE DecodeChar(str, i, ucs) DO
IF ucs = 0 THEN RETURN TRUE END
END;
RETURN FALSE
END Valid;
PROCEDURE Size*(CONST str: ARRAY OF CHAR): LONGINT;
VAR i: LONGINT;
BEGIN
i := 0; WHILE str[i] # 0X DO INC(i) END;
RETURN i
END Size;
PROCEDURE SetSize*(VAR str: ARRAY OF CHAR; size: LONGINT);
VAR i: LONGINT;
BEGIN
IF size > LEN(str)-1 THEN size := LEN(str)-1 END;
IF size > 0 THEN
i := size-1;
IF str[i] >= 80X THEN
WHILE ASH(LONG(ORD(str[i])), -6) = 2 DO DEC(i) END;
IF i + ORD(CodeLength[ORD(str[i])]) > size THEN size := i END
END
END;
str[size] := 0X
END SetSize;
PROCEDURE OffsetOfIndex*(CONST str: ARRAY OF CHAR; idx: LONGINT): LONGINT;
VAR i, ch: LONGINT;
BEGIN
i := 0;
LOOP
IF idx <= 0 THEN EXIT END;
ch := LONG(ORD(str[i]));
IF ch = 0 THEN EXIT END;
DEC(idx); INC(i, LONG(ORD(CodeLength[ch])))
END;
RETURN i
END OffsetOfIndex;
PROCEDURE Length*(CONST str: ARRAY OF CHAR): LONGINT;
VAR i, len, ch: LONGINT;
BEGIN
i := 0; len := 0;
LOOP
ch := LONG(ORD(str[i]));
IF ch = 0 THEN EXIT END;
INC(i, LONG(ORD(CodeLength[ch])));
INC(len)
END;
RETURN len
END Length;
PROCEDURE Assign*(CONST src: ARRAY OF CHAR; VAR dst: ARRAY OF CHAR);
VAR i: LONGINT; ch: CHAR;
BEGIN
i := 0;
LOOP
ch := src[i];
IF (ch = 0X) OR (i = LEN(dst)) THEN EXIT END;
dst[i] := ch; INC(i)
END;
SetSize(dst, i)
END Assign;
PROCEDURE Extract*(CONST src: ARRAY OF CHAR; idx, num: LONGINT; VAR dst: ARRAY OF CHAR);
VAR i, j: LONGINT; ch: CHAR;
BEGIN
i := OffsetOfIndex(src, idx); j := 0;
LOOP
ch := src[i];
IF (ch = 0X) OR (j = LEN(dst)) THEN EXIT END;
IF ASH(LONG(ORD(ch)), -6) # 2 THEN DEC(num) END;
IF num < 0 THEN EXIT END;
dst[j] := ch; INC(i); INC(j)
END;
SetSize(dst, j)
END Extract;
PROCEDURE Delete*(VAR str: ARRAY OF CHAR; idx, num: LONGINT);
VAR i, j: LONGINT; ch: CHAR;
BEGIN
i := OffsetOfIndex(str, idx); j := i;
LOOP
ch := str[j];
IF (num <= 0) OR (ch = 0X) THEN EXIT END;
INC(j, ORD(CodeLength[ORD(ch)]));
DEC(num)
END;
LOOP
str[i] := ch;
IF ch = 0X THEN EXIT END;
INC(i); INC(j);
ch := str[j]
END
END Delete;
PROCEDURE Append*(CONST src: ARRAY OF CHAR; VAR dst: ARRAY OF CHAR);
VAR i, j: LONGINT; ch: CHAR;
BEGIN
i := 0; j := Size(dst);
LOOP
ch := src[i];
IF (ch = 0X) OR (j = LEN(dst)) THEN EXIT END;
dst[j] := ch; INC(i); INC(j)
END;
SetSize(dst, j)
END Append;
PROCEDURE Concat*(CONST src1, src2: ARRAY OF CHAR; VAR dst: ARRAY OF CHAR);
BEGIN
Assign(src1, dst); Append(src2, dst)
END Concat;
PROCEDURE Insert*(CONST src: ARRAY OF CHAR; idx: LONGINT; VAR dst: ARRAY OF CHAR);
VAR i, j, m, n: LONGINT; ch: CHAR;
BEGIN
n := Size(src); m := Size(dst);
j := OffsetOfIndex(dst, idx); i := m-1;
WHILE i >= j DO
IF i+n < LEN(dst) THEN dst[i+n] := dst[i] END;
DEC(i)
END;
i := 0;
LOOP
ch := src[i];
IF (ch = 0X) OR (j = LEN(dst)) THEN EXIT END;
dst[j] := ch; INC(i); INC(j)
END;
SetSize(dst, m+n)
END Insert;
PROCEDURE UTF8toASCII*(CONST src: ARRAY OF CHAR; substitute: CHAR; VAR dst: ARRAY OF CHAR): LONGINT;
VAR count, i, len, pos, ucs: LONGINT;
BEGIN
len := LEN(dst); ucs := -1;
WHILE (ucs # 0) & DecodeChar(src, pos, ucs) & (i < len) DO
IF (ucs >= 0) & (ucs < 100H) THEN dst[i] := CHR(ucs); INC(i)
ELSIF (substitute # 0X) THEN dst[i] := substitute; INC(i); INC(count)
END
END;
RETURN count
END UTF8toASCII;
PROCEDURE ASCIItoUTF8*(CONST ascii: ARRAY OF CHAR; VAR utf8: ARRAY OF CHAR);
VAR i,j: LONGINT; dummy: BOOLEAN;
BEGIN
i := 0; j := 0;
WHILE (ascii[i] # 0X) & EncodeChar(ORD(ascii[i]), utf8, j) DO INC(i) END;
dummy := EncodeChar(0, utf8, j)
END ASCIItoUTF8;
PROCEDURE UTF8toUnicode*(CONST utf8: ARRAY OF CHAR; VAR ucs: ARRAY OF LONGINT; VAR idx: LONGINT);
VAR p, l: LONGINT;
BEGIN
p := 0; l := LEN(ucs)-1;
WHILE (idx < l) & DecodeChar(utf8, p, ucs[idx]) & (ucs[idx] > 0) DO INC(idx) END;
ucs[idx] := 0; INC(idx)
END UTF8toUnicode;
PROCEDURE UnicodetoUTF8*(CONST ucs: ARRAY OF LONGINT; VAR utf8: ARRAY OF CHAR);
VAR b: BOOLEAN; i, p, l: LONGINT;
BEGIN
b := TRUE; i := 0; p := 0; l := LEN(ucs);
WHILE (i < l) & b DO
b := EncodeChar(ucs[i], utf8, p);
INC(i)
END
END UnicodetoUTF8;
PROCEDURE UpperCase*(CONST src: ARRAY OF CHAR; VAR dst: ARRAY OF CHAR);
VAR i: LONGINT;
BEGIN
WHILE (src[i] # 0X) DO
IF (src[i] >= "a") & (src[i] <= "z") THEN dst[i] := CAP(src[i])
ELSE dst[i] := src[i]
END;
INC(i)
END;
dst[i] := 0X
END UpperCase;
PROCEDURE CanAssign*(CONST src, dst : ARRAY OF CHAR): BOOLEAN;
BEGIN
RETURN Size(src)+1 <= LEN(dst)
END CanAssign;
PROCEDURE CanExtract*(CONST src: ARRAY OF CHAR; idx, num: LONGINT; CONST dst: ARRAY OF CHAR): BOOLEAN;
BEGIN
HALT(99)
END CanExtract;
PROCEDURE CanDelete*(CONST str: ARRAY OF CHAR; idx, num: LONGINT);
BEGIN
HALT(99)
END CanDelete;
PROCEDURE CanAppend*(CONST src: ARRAY OF CHAR; CONST dst: ARRAY OF CHAR): BOOLEAN;
BEGIN
HALT(99)
END CanAppend;
PROCEDURE CanConcat*(CONST src1, src2, dst : ARRAY OF CHAR): BOOLEAN;
BEGIN
HALT(99)
END CanConcat;
PROCEDURE CanInsert*(CONST src: ARRAY OF CHAR; idx: LONGINT; CONST dst: ARRAY OF CHAR): BOOLEAN;
BEGIN
HALT(99)
END CanInsert;
PROCEDURE Compare*(CONST str1, str2: ARRAY OF CHAR): LONGINT;
VAR i: LONGINT; ch: CHAR;
BEGIN
i := 0;
LOOP
ch := str1[i];
IF ch # str2[i] THEN EXIT END;
IF ch = 0X THEN RETURN CmpEqual END;
INC(i)
END;
IF ch < str2[i] THEN RETURN CmpLess ELSE RETURN CmpGreater END
END Compare;
PROCEDURE CompareToUnicode*(CONST utf8 : ARRAY OF CHAR; CONST unicode : ARRAY OF LONGINT) : LONGINT;
VAR len, i, j, char, ucs, result : LONGINT; valid, abort : BOOLEAN;
BEGIN
len := LEN(unicode);
i := 0; j := 0; valid := TRUE; abort := FALSE;
WHILE valid & ~abort & (j < len) DO
valid := DecodeChar(utf8, i, char);
ucs := unicode[j];
abort := (char # ucs) OR (char = 0);
INC(j);
END;
IF valid THEN
IF (char = 0) & (ucs = 0) THEN result := CmpEqual;
ELSIF (char < ucs) THEN result := CmpLess;
ELSE result := CmpGreater;
END;
ELSE
result := CmpError;
END;
RETURN result;
END CompareToUnicode;
PROCEDURE FindNext*(CONST pat, str: ARRAY OF CHAR; startidx: LONGINT; VAR found: BOOLEAN; VAR patidx: LONGINT);
BEGIN
HALT(99)
END FindNext;
PROCEDURE FindPrev*(CONST pat, str: ARRAY OF CHAR; startidx: LONGINT; VAR found: BOOLEAN; VAR patidx: LONGINT);
BEGIN
HALT(99)
END FindPrev;
PROCEDURE FindDiff*(CONST str1, str2: ARRAY OF CHAR; VAR different: BOOLEAN; VAR idx: LONGINT);
VAR i, j: LONGINT; ch: CHAR;
BEGIN
i := 0; j := -1;
LOOP
ch := str1[i];
IF ASH(LONG(ORD(ch)), -6) # 2 THEN INC(j) END;
IF ch # str2[i] THEN
different := TRUE; idx := j;
EXIT
END;
IF ch = 0X THEN
different := FALSE;
EXIT
END;
INC(i)
END
END FindDiff;
BEGIN
FOR init := 0 TO 7FH DO CodeLength[init] := 1X END;
FOR init := 80H TO 0BFH DO CodeLength[init] := 7X END;
FOR init := 0C0H TO 0DFH DO CodeLength[init] := 2X END;
FOR init := 0E0H TO 0EFH DO CodeLength[init] := 3X END;
FOR init := 0F0H TO 0F7H DO CodeLength[init] := 4X END;
FOR init := 0F8H TO 0FBH DO CodeLength[init] := 5X END;
FOR init := 0FCH TO 0FDH DO CodeLength[init] := 6X END
END UTF8Strings.
(**
Notes:
This module manages UCS-32 0X-terminated character strings encoded as multi-byte UTF-8 strings. The UTF-8 encoding is decribed in RFC2279.
A CHAR value in a UTF-8 string can have one of three roles. First, it can be a 7-bit ASCII character directly encoded as one byte. Second, it can be the starting byte of a UCS-32 character encoded as 2 to 6 bytes. Third, it can be a non-starting byte of a UCS-32 character encoded as 2 to 6 bytes.
The role of a CHAR ch is encoded in its top two bits, as follows:
ASH(ORD(ch), -6) < 2, role is ASCII character (can also test ch < 80X).
ASH(ORD(ch), -6) = 2, role is non-starting byte of a multi-byte sequence.
ASH(ORD(ch), -6) = 3, role is starting byte of a multi-byte sequence.
The CodeLength string can be used to find the length of an encoding. Assuming ch is the starting byte of an encoding, ORD(CodeLength[ORD(ch)]) is the total number of bytes in the encoding. If ch is not a starting byte, this expression will return 7, indicating an error.
All string input parameters (except in DecodeChar, Valid and SetSize) are assumed to be 0X-terminated, well-formed UTF-8 strings. All string output parameters produced are also 0X-terminated, well-formed UTF-8 strings. It is assumed that the LEN of all ARRAY OF CHAR parameters is positive. Violations of these assumptions may cause run-time exceptions, but not endless loops or memory corruption.
In a secure network application, UTF-8 strings received over the network MUST first be validated, and only used if found to be valid. The reason is some invalid encodings can be used to code characters in alternate ways, which may bypass security checks, or cause other problems. See the RFC for more details.
All the procedures truncate the destination string at a UTF-8 character boundary if enough space is not available. A CanX function can be used to check whether truncation will occur if operation X is performed with the specified parameters.
String constants in Oberon programs are not necessarily well-formed UTF-8 strings, unless they contain only ASCII characters (below 80X).
The Oberon built-in procedure COPY does not necessarily produce a well-formed UTF-8 string, because it can truncate the destination string in the middle of a multi-byte character. Rather use the Assign procedure from this module.
The Oberon string comparison operators <, <=, =, >=, > can be used on UTF-8 strings for lexical comparisons.
*)
(*
o assume indexes are inside string? except lengths
*)
Backup.WriteFiles UTF8Strings.Mod ~