MODULE UnicodeProperties; (** AUTHOR "gubsermi"; PURPOSE "Reading the Unicode.txt file and interpreting the properties"; *)

IMPORT
	Texts, Codecs, Files, Streams, KernelLog, Strings;

CONST
	(* Character codes that the readers in this module compare against the values delivered by Texts.TextReader.ReadCh. *)
	(* NUL and EOT are both checked as end-of-text indicators by UnicodeTxtReader.GetProperty. *)
	NUL*	= 00H;
	EOT*	= 04H;
	(* LF and CR are the line terminators recognised by TxtReader.NextLine and GetProperty. *)
	LF*		= 0AH;
	CR*		= 0DH;
	(* SP terminates the target code in BidiMirroringTxtReader.GetTargetChar. *)
	SP*		= 20H;

	(* When TRUE, CharacterPropertyCache logs every cache hit and every insertion to KernelLog. *)
	CacheDebugging = FALSE;


VAR
	(* Read-only for clients. Reset to FALSE at the start of TxtReader.LoadTxtFile and set to TRUE when loading fails.
	    The flag is module-wide, i.e. it reflects the most recent LoadTxtFile call of any reader. *)
	error- : BOOLEAN;



TYPE
	(* Caches a result from the property files. Can be used for a string or a character value, but not both! *)
	CacheElement = OBJECT
	VAR
		next : CacheElement;			(* link to a further element; NIL if there is none *)
		key : Texts.Char32;				(* character code this element belongs to *)
		sValue : ARRAY 256 OF CHAR;	(* string payload; empty for character entries *)
		cValue : Texts.Char32;			(* character payload; -1 for non-empty string entries *)

		(* Creates an element for 'key'. A non-empty 'sValue' makes this a string entry (cValue is then forced to -1);
		    an empty 'sValue' makes it a character entry that stores 'cValue'. *)
		PROCEDURE &Init*(key : Texts.Char32; CONST sValue : ARRAY OF CHAR; cValue : Texts.Char32);
		BEGIN
			SELF.key := key;
			IF sValue[0] = 0X THEN
				(* no string supplied: keep the character value *)
				SELF.sValue[0] := 0X;
				SELF.cValue := cValue;
			ELSE
				(* string supplied: keep it and mark the character value as unused *)
				Strings.Copy(sValue,0,LEN(sValue),SELF.sValue);
				SELF.cValue := -1;
			END;
		END Init;

	END CacheElement;


	(* The Property Cache uses a Hashmap of a specific size to cache either string or character properties. *)
	CharacterPropertyCache = OBJECT
	VAR
		internalCache : POINTER TO ARRAY OF CacheElement;	(* bucket heads; each bucket is a singly linked list *)
		cacheSize : LONGINT;									(* number of buckets, always >= 1 *)

		(* Initializes the hashmap with 'size' buckets. Sizes below 1 are raised to 1, because the bucket index is
		    computed as 'key MOD cacheSize'. *)
		PROCEDURE &Init*(size : LONGINT);
		BEGIN
			IF size < 1 THEN size := 1 END;
			cacheSize := size;
			NEW(internalCache,cacheSize);
		END Init;

		(* Returns the most recently inserted element stored for 'char', or NIL if there is none. *)
		PROCEDURE Find(char : Texts.Char32) : CacheElement;
		VAR
			bucket : LONGINT;
			element : CacheElement;
		BEGIN
			bucket := char MOD cacheSize;
			element := internalCache[bucket];
			WHILE (element # NIL) & (element.key # char) DO
				element := element.next;
			END;
			RETURN element;
		END Find;

		(* Puts 'element' at the head of the bucket that belongs to 'char'. *)
		PROCEDURE Link(char : Texts.Char32; element : CacheElement);
		VAR
			bucket : LONGINT;
		BEGIN
			bucket := char MOD cacheSize;
			element.next := internalCache[bucket];
			internalCache[bucket] := element;
		END Link;

		(* Searches the cache for 'char' and returns the corresponding string entry in 'res'.
		    'res' is the empty string if nothing was found (or if an empty string was stored). *)
		PROCEDURE SLookup(char : Texts.Char32; VAR res : ARRAY OF CHAR);
		VAR
			hit : CacheElement;
		BEGIN
			hit := Find(char);
			IF hit # NIL THEN
				IF CacheDebugging THEN
					KernelLog.String("found: "); KernelLog.Hex(hit.key,4);
					KernelLog.String(" ("); KernelLog.String(hit.sValue);
					KernelLog.String(")"); KernelLog.Ln;
				END;
				(* COPY truncates to the size of 'res' and always terminates the result, so no length has to be passed *)
				COPY(hit.sValue,res);
			ELSE
				(* clear the result if nothing was found. *)
				res := "";
			END;
		END SLookup;

		(* Searches the cache for 'char' and returns the corresponding character entry, or -1 if nothing was found. *)
		PROCEDURE CLookup(char : Texts.Char32) : Texts.Char32;
		VAR
			hit : CacheElement;
		BEGIN
			hit := Find(char);
			IF hit = NIL THEN
				(* return a 'fault code' if nothing was found *)
				RETURN -1;
			END;

			IF CacheDebugging THEN
				KernelLog.String("found: "); KernelLog.Hex(hit.key,4);
				KernelLog.String(" ("); KernelLog.Hex(hit.cValue,4);
				KernelLog.String(")"); KernelLog.Ln;
			END;
			RETURN hit.cValue;
		END CLookup;


		(* Inserts a new string entry for a given key. Existing entries for the same key are shadowed, not replaced. *)
		PROCEDURE SInsert(char : Texts.Char32; CONST value : ARRAY OF CHAR);
		VAR
			newElement : CacheElement;
		BEGIN
			NEW(newElement,char,value,-1);
			Link(char,newElement);

			IF CacheDebugging THEN
				KernelLog.String("inserted: "); KernelLog.Hex(char,4);
				KernelLog.String(" (");
				KernelLog.String(value);
				KernelLog.String(")"); KernelLog.Ln;
			END;
		END SInsert;

		(* Inserts a new character entry for a given key. Existing entries for the same key are shadowed, not replaced. *)
		PROCEDURE CInsert(char : Texts.Char32; value : Texts.Char32);
		VAR
			newElement: CacheElement;
			dummy : ARRAY 1 OF CHAR;
		BEGIN
			(* an empty string tells CacheElement.Init to store the character value *)
			dummy[0] := 0X;
			NEW(newElement,char,dummy,value);
			Link(char,newElement);

			IF CacheDebugging THEN
				KernelLog.String("inserted: "); KernelLog.Hex(char,4);
				KernelLog.String(" ("); KernelLog.Hex(value,4);
				KernelLog.String(")"); KernelLog.Ln;
			END;
		END CInsert;

		(* Prints the whole cache to the kernel log, one bucket per line. String entries are printed as text,
		    character entries as hexadecimal code. *)
		PROCEDURE Print;
		VAR
			i : LONGINT;
			thisElement : CacheElement;
		BEGIN
			FOR i := 0 TO cacheSize - 1 DO
				thisElement := internalCache[i];
				KernelLog.Int(i,3); KernelLog.String(": ");
				WHILE thisElement # NIL DO
					KernelLog.Int(thisElement.key,4); KernelLog.String(" (");
					IF thisElement.cValue = -1 THEN
						KernelLog.String(thisElement.sValue);
					ELSE
						KernelLog.Hex(thisElement.cValue,4);
					END;
					KernelLog.String(") -> ");
					thisElement := thisElement.next;
				END;
				KernelLog.Ln;
			END;
		END Print;

	END CharacterPropertyCache;

	(* A handy implementation for text file reading and analysis. Basic functionality is provided. TxtReaders that
	    handle a specific text layout should inherit from this class and (re-)implement the necessary procedures. *)
	TxtReader = OBJECT
	VAR
		filename : ARRAY 256 OF CHAR;		(* name of the file to load; set by the subclass before LoadTxtFile *)
		text : Texts.Text;					(* decoded file contents; NIL if loading failed *)
		textReader : Texts.TextReader;		(* reader on 'text'; NIL if loading failed *)
		startPos : LONGINT;					(* position of the first relevant line, determined by FindStartPos *)
		decoder : Codecs.TextDecoder;
		msg : ARRAY 512 OF CHAR;			(* scratch buffer for log messages *)
		fullname : ARRAY 256 OF CHAR;		(* name as reported by the file object *)
		file : Files.File;
		in: Streams.Reader;
		decoderRes : LONGINT;

		(* Writes the message currently held in 'msg' to the kernel log and raises the module's error flag. *)
		PROCEDURE ReportError;
		BEGIN
			KernelLog.String(msg); KernelLog.Ln;
			error := TRUE;
		END ReportError;

		(* Loads the file 'filename' into a local Text and creates an associated TextReader.
		    On any failure a message is logged, 'error' is set and 'text'/'textReader' are left untouched.
		    FindStartPos is only called when the text has actually been loaded, so overriding implementations
		    are never invoked without a text. *)
		PROCEDURE LoadTxtFile;
		BEGIN
			error := FALSE;
			startPos := 0;
			COPY(filename, fullname);

			(* Check whether file exists and get its canonical name *)
			file := Files.Old(filename);
			IF (file # NIL) THEN
				file.GetName(fullname);
			ELSE
				file := Files.New(filename); (* to get path *)
				IF (file # NIL) THEN
					file.GetName(fullname);
					file := NIL;
				END;
			END;

			IF (file = NIL) THEN
				msg := "file '"; Strings.Append(msg, fullname); Strings.Append(msg,"' not found.");
				ReportError;
			ELSE
				decoder := Codecs.GetTextDecoder("ISO8859-1");
				IF (decoder = NIL) THEN
					msg := "No decoder for file "; Strings.Append(msg, fullname);
					Strings.Append(msg, " (Format: "); Strings.Append(msg, "ISO8859-1"); Strings.Append(msg, ")");
					ReportError;
				ELSE
					in := Codecs.OpenInputStream(fullname);
					IF (in = NIL) THEN
						msg := "Can't open input stream on file "; Strings.Append(msg, fullname);
						ReportError;
					ELSE
						decoder.Open(in, decoderRes);
						IF decoderRes = 0 THEN
							text := decoder.GetText();
							NEW(textReader,text);
						ELSE
							(* a failed decode used to go unnoticed: no text, but no error either *)
							msg := "Can't decode file "; Strings.Append(msg, fullname);
							ReportError;
						END;
					END;
				END;
			END;

			IF ~error & (text # NIL) & (textReader # NIL) THEN
				FindStartPos;
			END;
		END LoadTxtFile;

		(* Abstract procedure to be overwritten by the children of TxtReader *)
		PROCEDURE FindStartPos;
		BEGIN
			HALT (999);
		END FindStartPos;

		(* Skips the rest of the current line. Reading stops after the first CR or LF, or when the reader
		    delivers EOT/NUL, so a last line without terminator cannot cause an endless loop. *)
		PROCEDURE NextLine;
		VAR
			thisChar : Texts.Char32;
		BEGIN
			IF textReader = NIL THEN RETURN END;

			(* read the characters until the end of the line (or of the text) is reached *)
			REPEAT
				textReader.ReadCh(thisChar);
			UNTIL (thisChar = LF) OR (thisChar = CR) OR (thisChar = EOT) OR (thisChar = NUL);
		END NextLine;

	END TxtReader;


TYPE

	(* TxtReader to read the UnicodeData.txt file. So far there's direct support for the bidi character type and the
	'mirrored' property. More explicit lookups can easily be added later on. *)
	UnicodeTxtReader*=OBJECT(TxtReader)
	VAR
		(* For each property that is explicitly needed, a cache is used. Whenever a new property is needed often,
		feel free to add another cache. *)
		charTypeCache, mirrorPropCache : CharacterPropertyCache;

		(* Loads the UnicodeData.txt into memory and creates the caches. *)
		PROCEDURE &Init*;
		BEGIN
			filename := "UnicodeData.txt";
			LoadTxtFile;
			NEW(charTypeCache,256);
			NEW(mirrorPropCache,256);
		END Init;

		(* The property file has no leading comments. Therefore there are no lines to be skipped *)
		PROCEDURE FindStartPos;
		BEGIN
			startPos := 0;
		END FindStartPos;


		(* Returns the bidirectional character type for a specific character in 'res'. "L" is used if the file
		    holds no type for the character. If 'res' is NIL, a buffer is allocated for the caller. *)
		PROCEDURE GetBidiCharacterType*(char : Texts.Char32; VAR res : Strings.String);
		VAR
			cached : ARRAY 16 OF CHAR;
		BEGIN
			IF res = NIL THEN NEW(res,16) END;

			(* firstly, the appropriate cache is searched for an entry of this character *)
			charTypeCache.SLookup(char,cached);

			IF cached # "" THEN
				(* COPY truncates to the size of res^ and always terminates the result *)
				COPY(cached,res^);
			ELSE
				(* if nothing was found the file is read and the result is added to the cache. *)
				GetProperty(char,4,res^);
				IF res^ = "" THEN
					COPY("L",res^);
					KernelLog.String("no character type has been found. Using 'L'"); KernelLog.Ln;
				END;
				charTypeCache.SInsert(char,res^);
			END;
		END GetBidiCharacterType;

		(* Checks if a specific character has its 'mirrored' property set to 'yes' *)
		PROCEDURE IsMirroredChar*(char : Texts.Char32) : BOOLEAN;
		VAR
			res : ARRAY 16 OF CHAR;
		BEGIN

			(* firstly, the appropriate cache is searched for an entry of this character *)
			mirrorPropCache.SLookup(char,res);

			(* if nothing was found the file is read and the result is added to the cache. *)
			IF res = "" THEN
				GetProperty(char,9,res);
				(* An empty cache entry cannot be told apart from a miss. Store "N" for characters without the
				    property, otherwise every call would rescan the file and add another element. *)
				IF res = "" THEN COPY("N",res) END;
				mirrorPropCache.SInsert(char,res);
			END;

			RETURN res = "Y";
		END IsMirroredChar;

		(* Checks if the character type of a specific character is 'WS' *)
		PROCEDURE IsWhiteSpaceChar*(char : Texts.Char32) : BOOLEAN;
		VAR
			res : ARRAY 16 OF CHAR;
		BEGIN

			(* firstly, the appropriate cache is searched for an entry of this character *)
			charTypeCache.SLookup(char,res);

			(* if nothing was found the file is read and the result is added to the cache. *)
			IF res = "" THEN
				GetProperty(char,4,res);
				(* Same default as in GetBidiCharacterType, which shares this cache. An empty entry would never
				    count as a hit and would be inserted again on every call. *)
				IF res = "" THEN COPY("L",res) END;
				charTypeCache.SInsert(char,res);
			END;

			RETURN res = "WS";
		END IsWhiteSpaceChar;

		(* Reads one field of the current line. A field ends at ';', CR, LF or at the end of the text (EOT/NUL);
		    the terminating character is returned in 'last'. If 'keep' is set, the field is stored in 'buf',
		    truncated to the size of 'buf' and always terminated; otherwise 'buf' is not touched.
		    Assumes the text to be locked and 'buf' to hold at least one character. *)
		PROCEDURE ReadField(keep : BOOLEAN; VAR buf : ARRAY OF CHAR; VAR last : Texts.Char32);
		VAR
			n : LONGINT;
		BEGIN
			n := 0;
			LOOP
				textReader.ReadCh(last);
				IF (last = ORD(';')) OR (last = CR) OR (last = LF) OR (last = EOT) OR (last = NUL) THEN EXIT END;
				IF keep & (n < LEN(buf) - 1) THEN
					buf[n] := CHR(last);
					INC(n);
				END;
			END;
			IF keep THEN buf[n] := 0X END;
		END ReadField;

		(* Gets the character's property at a certain position (0 being the character's code itself).
		    'res' is the empty string if no text is loaded, if 'pos' is negative, if the character has no line
		    in the file or if its line has fewer than 'pos' + 1 fields. Values that do not fit are truncated.
		    The lines are expected to be sorted by ascending character code: the search stops at the first
		    code that is greater than 'char'. *)
		PROCEDURE GetProperty*(char : Texts.Char32; pos : LONGINT; VAR res : ARRAY OF CHAR);
		VAR
			code : ARRAY 16 OF CHAR;
			last, lineChar : Texts.Char32;
			field, convRes : LONGINT;
			matched : BOOLEAN;
		BEGIN
			IF LEN(res) = 0 THEN RETURN END;
			res[0] := 0X;
			IF (pos < 0) OR (text = NIL) OR (textReader = NIL) THEN RETURN END;

			text.AcquireRead;
			textReader.SetPosition(startPos);

			(* iterate through the lines; every EXIT leads to the single ReleaseRead below *)
			LOOP
				(* field 0 is the character's code *)
				ReadField(TRUE,code,last);
				matched := FALSE;
				IF code[0] # 0X THEN
					Strings.HexStrToInt(code,lineChar,convRes);
					(* the wanted character has already been passed *)
					IF lineChar > char THEN EXIT END;
					matched := (lineChar = char);
				END;

				IF matched THEN
					IF pos = 0 THEN
						COPY(code,res);
					ELSE
						(* walk to the wanted field; only that field is stored. If the line ends early,
						    'res' simply stays empty. *)
						field := 0;
						WHILE (last = ORD(';')) & (field < pos) DO
							INC(field);
							ReadField(field = pos,res,last);
						END;
					END;
					EXIT;
				END;

				(* Not the wanted line: consume the remaining fields. Nothing is skipped if the line terminator
				    has already been read, e.g. for blank lines. *)
				WHILE last = ORD(';') DO
					ReadField(FALSE,res,last);
				END;
				IF (last = EOT) OR (last = NUL) THEN EXIT END;
			END;

			text.ReleaseRead;
		END GetProperty;

		(* Exported procedure to print the character type cache *)
		PROCEDURE PrintCharTypeCache*;
		BEGIN
			charTypeCache.Print;
		END PrintCharTypeCache;

	END UnicodeTxtReader;


TYPE

	(* TxtReader to read the BidiMirroring.txt file. *)
	BidiMirroringTxtReader*=OBJECT(TxtReader)
	VAR
		mirrorCache : CharacterPropertyCache;	(* maps a character to its mirrored counterpart (0 = none found) *)

		(* Loads the BidiMirroring.txt into memory *)
		PROCEDURE &Init*;
		BEGIN
			filename := "BidiMirroring.txt";
			LoadTxtFile;
			NEW(mirrorCache,256);
		END Init;

		(* Skips the rest of the current line. Stops after CR or LF and also when the reader delivers EOT/NUL,
		    so it terminates on a last line without line terminator. Assumes the text to be locked. *)
		PROCEDURE SkipLine;
		VAR
			ch : Texts.Char32;
		BEGIN
			REPEAT
				textReader.ReadCh(ch);
			UNTIL (ch = LF) OR (ch = CR) OR (ch = EOT) OR (ch = NUL);
		END SkipLine;


		(* Finds the start position of the relevant data. The mirroring file has a large comment at the beginning,
		so the scanner needs to be set to the first line of interest. Leading '#' lines and blank lines are skipped;
		startPos is the position of the first character of the first other line. Without a loaded text, startPos is 0. *)
		PROCEDURE FindStartPos;
		VAR
			thisChar : LONGINT;
			lineStart : LONGINT;
		BEGIN
			startPos := 0;
			IF (text = NIL) OR (textReader = NIL) THEN RETURN END;

			text.AcquireRead;
			LOOP
				(* remember where the line begins before its first character is consumed *)
				lineStart := textReader.GetPosition();
				textReader.ReadCh(thisChar);
				IF thisChar = ORD('#') THEN
					SkipLine;
				ELSIF (thisChar = CR) OR (thisChar = LF) THEN
					(* blank line: its terminator has just been consumed *)
				ELSE
					EXIT;
				END;
			END;

			(* store the start position *)
			startPos := lineStart;
			text.ReleaseRead;
		END FindStartPos;



		(* Reads the next source character. The procedure assumes the scanner to be at the beginning of a line and
		the text to be locked. Blank lines are skipped. Returns -1 if a '#' or the end of the text is reached
		before a ';'-terminated code has been read. *)
		PROCEDURE GetSourceChar() : Texts.Char32;
		VAR
			sourceString : ARRAY 8 OF CHAR;
			sourceInt, tempChar : Texts.Char32;
			n : LONGINT;
			res : LONGINT;
		BEGIN
			sourceInt := -1;
			n := 0;

			(* read the characters that form the code for the source character *)
			LOOP
				textReader.ReadCh(tempChar);
				IF (tempChar = ORD(';')) OR (tempChar = ORD('#')) OR (tempChar = EOT) OR (tempChar = NUL) THEN EXIT END;
				IF (tempChar = CR) OR (tempChar = LF) THEN
					(* line without a ';': forget what has been read and continue with the next line *)
					n := 0;
				ELSIF n < LEN(sourceString) - 1 THEN
					sourceString[n] := CHR(tempChar);
					INC(n);
				END;
			END;

			(* if the code was terminated by a ';' it is assumed to be valid and is converted into an integer *)
			IF (tempChar = ORD(';')) & (n > 0) THEN
				sourceString[n] := 0X;
				Strings.HexStrToInt(sourceString,sourceInt,res);
			END;

			RETURN sourceInt;
		END GetSourceChar;


		(* Reads the next target character. The procedure assumes the scanner to have already read the source character
		including its ';' and the text to be locked. Leading spaces are skipped; the code ends at a space, a '#',
		the end of the line or the end of the text. *)
		PROCEDURE GetTargetChar() : Texts.Char32;
		VAR
			targetString : ARRAY 8 OF CHAR;
			targetInt, tempChar : Texts.Char32;
			n : LONGINT;
			res : LONGINT;
		BEGIN
			targetInt := -1;
			n := 0;

			(* read the characters that form the code for the target character *)
			LOOP
				textReader.ReadCh(tempChar);
				IF (tempChar = SP) & (n = 0) THEN
					(* whitespace between ';' and the code *)
				ELSIF (tempChar = SP) OR (tempChar = ORD('#')) OR (tempChar = CR) OR (tempChar = LF)
						OR (tempChar = EOT) OR (tempChar = NUL) THEN
					EXIT;
				ELSIF n < LEN(targetString) - 1 THEN
					targetString[n] := CHR(tempChar);
					INC(n);
				END;
			END;

			(* terminate the result string and convert it into an integer *)
			targetString[n] := 0X;
			Strings.HexStrToInt(targetString,targetInt,res);

			RETURN targetInt;
		END GetTargetChar;



		(* Searches the mirror file for a given character and returns its counterpart if found, 0 otherwise.
		Both outcomes are cached, so the file is scanned at most once per character. Without a loaded text
		the result is 0 and nothing is cached. *)
		PROCEDURE GetMirroredChar*(char : Texts.Char32) : Texts.Char32;
		VAR
			sChar : Texts.Char32;
		BEGIN

			(* look in the cache first *)
			sChar := mirrorCache.CLookup(char);
			IF sChar # -1 THEN RETURN sChar END;

			IF (text = NIL) OR (textReader = NIL) THEN RETURN 0 END;

			text.AcquireRead;

			(* search the right source character *)
			textReader.SetPosition(startPos);
			LOOP
				sChar := GetSourceChar();
				(* if the char is found or if the end of chars is reached, jump out of the loop *)
				IF (sChar = char) OR (sChar = -1) THEN EXIT END;
				SkipLine;
			END;

			IF (sChar = -1) THEN
				(* Null stands for 'the source character could not be found' *)
				sChar := 0;
			ELSE
				sChar := GetTargetChar();
			END;
			text.ReleaseRead;

			mirrorCache.CInsert(char,sChar);
			RETURN sChar;
		END GetMirroredChar;

	END BidiMirroringTxtReader;

END UnicodeProperties.



SystemTools.Free UnicodeProperties ~


UnicodeProperties.TestIsMirroredChar 00000028H ~


PC0.Compile UnicodeProperties.Mod ~