1 {*********************************************************}
3 { Zeos Database Objects }
4 { String tokenizing classes and interfaces }
6 { Originally written by Sergey Seroukhov }
8 {*********************************************************}
10 {@********************************************************}
11 { Copyright (c) 1999-2012 Zeos Development Group }
13 { License Agreement: }
15 { This library is distributed in the hope that it will be }
16 { useful, but WITHOUT ANY WARRANTY; without even the }
17 { implied warranty of MERCHANTABILITY or FITNESS FOR }
18 { A PARTICULAR PURPOSE. See the GNU Lesser General }
19 { Public License for more details. }
21 { The source code of the ZEOS Libraries and packages are }
22 { distributed under the Library GNU General Public }
23 { License (see the file COPYING / COPYING.ZEOS) }
24 { with the following modification: }
25 { As a special exception, the copyright holders of this }
26 { library give you permission to link this library with }
27 { independent modules to produce an executable, }
28 { regardless of the license terms of these independent }
29 { modules, and to copy and distribute the resulting }
30 { executable under terms of your choice, provided that }
31 { you also meet, for each linked independent module, }
32 { the terms and conditions of the license of that module. }
33 { An independent module is a module which is not derived }
34 { from or based on this library. If you modify this }
35 { library, you may extend this exception to your version }
36 { of the library, but you are not obligated to do so. }
37 { If you do not wish to do so, delete this exception }
38 { statement from your version. }
41 { The project web site is located on: }
42 { http://zeos.firmos.at (FORUM) }
43 { http://sourceforge.net/p/zeoslib/tickets/ (BUGTRACKER)}
44 { svn://svn.code.sf.net/p/zeoslib/code-0/trunk (SVN) }
46 { http://www.sourceforge.net/projects/zeoslib. }
49 { Zeos Development Group. }
50 {********************************************************@}
59 Classes, {$IFDEF MSEgui}mclasses,{$ENDIF} SysUtils,
60 ZClasses, ZCompatibility;
65 Objects of this class represent a type of token,
66 such as "number", "symbol" or "word".
68 TZTokenType = (ttUnknown, ttEOF, ttFloat, ttInteger, ttHexDecimal,
69 ttNumber, ttSymbol, ttQuoted, ttQuotedIdentifier, ttWord, ttKeyword,
70 ttWhitespace, ttComment, ttSpecial, ttTime, ttDate, ttDateTime, ttEscape);
73 Defines options for tokenizing strings.
75 TZTokenOption = (toSkipUnknown, toSkipWhitespaces, toSkipComments,
76 toSkipEOF, toUnifyWhitespaces, toUnifyNumbers, toDecodeStrings);
77 TZTokenOptions = set of TZTokenOption;
80 A token represents a logical chunk of a string. For
81 example, a typical tokenizer would break the string
82 <code>"1.23 <= 12.3"</code> into three tokens: the number
83 1.23, a less-than-or-equal symbol, and the number 12.3. A
84 token is a receptacle, and relies on a tokenizer to decide
85 precisely how to divide a string into tokens.
87 TZToken = {$ifndef FPC_REQUIRES_PROPER_ALIGNMENT}packed{$endif} record
89 TokenType: TZTokenType;
92 {** Defines a dynamic array of tokens. }
93 TZTokenDynArray = array of TZToken;
95 // Forward declaration
99 A tokenizerState returns a token, given a reader, an initial character
100 read from the reader, and a tokenizer that is conducting an overall
101 tokenization of the reader. The tokenizer will typically have a character
102 state table that decides which state to use, depending on an initial
103 character. If a single character is insufficient, a state such
104 as <code>SlashState</code> will read a second character, and may delegate
105 to another state, such as <code>SlashStarState</code>. This prospect
106 of delegation is the reason that the <code>nextToken()</code> method has a
109 TZTokenizerState = class (TObject)
111 function NextToken(Stream: TStream; FirstChar: Char;
112 Tokenizer: TZTokenizer): TZToken; virtual; abstract;
117 An <code>EscapeState</code> object returns binary/string data from a reader.
119 state's idea is a safe work-around for binary/string data supplied by a DataSet.
120 So it has some requirements to pick out this data from the SQL-
123 First: We have to define one or some Chars to detect this state.
124 Example: If the data was given like:
125 ~<|:%d|<~'...Binary/StringData...'~<|:%d|<~
126 we are able to predetect this State.
128 Second: The parameter d represents an Integer (the count of chars).
129 If we do not use this, it is possible that the Tokenizer goes
130 willy-nilly when reading binary data!
132 Third: The GenericResolver, which assembles the insert/update
133 statements, has to add this as prefix and suffix.
135 Fourth: The user of this component has to know this too, and has to do it
136 beforehand if he wants to insert/update binary data in a self-
137 assembled query. So I think it would be better to add a published
138 read-only property like:
141 If we did this correctly, we are able to disassemble all queries and
142 execute the necessary UTF8 encoding of the TZQuoteState and
143 TZWordState which represents either Quoted-String-Data or
144 Catalog/Table/Alias/Field name-spaces.
146 This state is only necessary for <code>Delphi2009+</code> (2009 and later)
147 and results from its mixing of n-byte chars and 1-byte binary data chars.
149 TZEscapeState = class (TZTokenizerState)
150 function NextToken(Stream: TStream; FirstChar: Char;
151 Tokenizer: TZTokenizer): TZToken; override;
154 A NumberState object returns a number from a reader. This
155 state's idea of a number allows an optional, initial
156 minus sign, followed by one or more digits. A decimal
157 point and another string of digits may follow these digits.
159 TZNumberState = class (TZTokenizerState)
161 function NextToken(Stream: TStream; FirstChar: Char;
162 Tokenizer: TZTokenizer): TZToken; override;
166 A quoteState returns a quoted string token from a reader.
167 This state will collect characters until it sees a match
168 to the character that the tokenizer used to switch to
169 this state. For example, if a tokenizer uses a double-
170 quote character to enter this state, then <code>
171 nextToken()</code> will search for another double-quote
172 until it finds one or finds the end of the reader.
174 TZQuoteState = class (TZTokenizerState)
176 function NextToken(Stream: TStream; FirstChar: Char;
177 Tokenizer: TZTokenizer): TZToken; override;
179 function EncodeString(const Value: string; QuoteChar: Char): string; virtual;
180 function DecodeString(const Value: string; QuoteChar: Char): string; virtual;
184 A CommentState object returns a comment from a reader.
186 TZCommentState = class (TZTokenizerState)
188 function NextToken(Stream: TStream; FirstChar: Char;
189 Tokenizer: TZTokenizer): TZToken; override;
193 This state will either delegate to a comment-handling
194 state, or return a token with just a slash in it.
196 TZCppCommentState = class (TZCommentState)
198 function GetMultiLineComment(Stream: TStream): string; virtual;
199 function GetSingleLineComment(Stream: TStream): string; virtual;
201 function NextToken(Stream: TStream; FirstChar: Char;
202 Tokenizer: TZTokenizer): TZToken; override;
206 This state will either delegate to a comment-handling
207 state, or return a token with just a slash in it.
209 TZCCommentState = class (TZCppCommentState)
211 function NextToken(Stream: TStream; FirstChar: Char;
212 Tokenizer: TZTokenizer): TZToken; override;
215 {*Fix for C++ Builder hpp generation bug - #817612 *}
216 (*$HPPEMIT 'namespace Ztokenizer {class DELPHICLASS TZSymbolNode;}' *)
217 // Forward declaration
218 TZSymbolNode = class;
219 TZSymbolNodeArray = array of TZSymbolNode;
222 A <code>SymbolNode</code> object is a member of a tree that
223 contains all possible prefixes of allowable symbols. Multi-
224 character symbols appear in a <code>SymbolNode</code> tree
225 with one node for each character.
227 For example, the symbol <code>=:~</code> will appear in a
228 tree as three nodes. The first node contains an equals sign,
229 and has a child; that child contains a colon and has a
230 child; this third child contains a tilde, and has no
231 children of its own. If the colon node had another child
232 for a dollar sign character, then the tree would contain
233 the symbol <code>=:$</code>.
235 A tree of <code>SymbolNode</code> objects collaborate to
236 read a (potentially multi-character) symbol from an input
237 stream. A root node with no character of its own finds an
238 initial node that represents the first character in the
239 input. This node looks to see if the next character in the
240 stream matches one of its children. If so, the node
241 delegates its reading task to its child. This approach
242 walks down the tree, pulling symbols from the input that
243 match the path down the tree.
245 When a node does not have a child that matches the next
246 character, we will have read the longest possible symbol
247 prefix. This prefix may or may not be a valid symbol.
248 Consider a tree that has had <code>=:~</code> added and has
249 not had <code>=:</code> added. In this tree, of the three
250 nodes that contain <code>=:~</code>, only the first and
251 third contain complete symbols. If, say, the input contains
252 <code>=:a</code>, the colon node will not have a child that
253 matches the 'a' and so it will stop reading. The colon node
254 has to "unread": it must push back its character, and ask
255 its parent to unread. Unreading continues until it reaches
256 an ancestor that represents a valid symbol.
258 TZSymbolNode = class (TObject)
261 FChildren: TZSymbolNodeArray;
263 FParent: TZSymbolNode;
265 procedure AddDescendantLine(const Value: string);
266 function DeepestRead(Stream: TStream): TZSymbolNode;
267 function EnsureChildWithChar(Value: Char): TZSymbolNode;
268 function FindChildWithChar(Value: Char): TZSymbolNode; virtual;
269 function FindDescendant(const Value: string): TZSymbolNode;
270 function UnreadToValid(Stream: TStream): TZSymbolNode;
272 property Children: TZSymbolNodeArray read FChildren write FChildren;
273 property Character: Char read FCharacter write FCharacter;
274 property Valid: Boolean read FValid write FValid;
275 property Parent: TZSymbolNode read FParent write FParent;
277 constructor Create(Parent: TZSymbolNode; Character: Char);
278 destructor Destroy; override;
280 function Ancestry: string; virtual;
284 This class is a special case of a <code>SymbolNode</code>. A
285 <code>SymbolRootNode</code> object has no symbol of its
286 own, but has children that represent all possible symbols.
288 TZSymbolRootNode = class (TZSymbolNode)
290 function FindChildWithChar(Value: Char): TZSymbolNode; override;
294 procedure Add(const Value: string);
295 function Ancestry: string; override;
296 function NextSymbol(Stream: TStream; FirstChar: Char): string;
300 The idea of a symbol is a character that stands on its
301 own, such as an ampersand or a parenthesis. For example,
302 when tokenizing the expression <code>(isReady)&
303 (isWilling) </code>, a typical tokenizer would return 7
304 tokens, including one for each parenthesis and one for
305 the ampersand. Thus a series of symbols such as
306 <code>)&( </code> becomes three tokens, while a series
307 of letters such as <code>isReady</code> becomes a single
310 Multi-character symbols are an exception to the rule
311 that a symbol is a standalone character. For example, a
312 tokenizer may want less-than-or-equals to tokenize as a
313 single token. This class provides a method for
314 establishing which multi-character symbols an object of
315 this class should treat as single symbols. This allows,
316 for example, <code>"cat <= dog"</code> to tokenize as
317 three tokens, rather than splitting the less-than and
318 equals symbols into separate tokens.
320 By default, this state recognizes the following multi-
321 character symbols: <code>!=, :-, <=, >=</code>
323 TZSymbolState = class (TZTokenizerState)
325 FSymbols: TZSymbolRootNode;
327 property Symbols: TZSymbolRootNode read FSymbols write FSymbols;
330 destructor Destroy; override;
332 function NextToken(Stream: TStream; FirstChar: Char;
333 Tokenizer: TZTokenizer): TZToken; override;
334 procedure Add(const Value: string); virtual;
338 A whitespace state ignores whitespace (such as blanks
339 and tabs), and returns the tokenizer's next token. By
340 default, all characters from 0 to 32 are whitespace.
342 TZWhitespaceState = class (TZTokenizerState)
344 FWhitespaceChars: array[0..ord(high(char))] of Boolean;
348 function NextToken(Stream: TStream; FirstChar: Char;
349 Tokenizer: TZTokenizer): TZToken; override;
350 procedure SetWhitespaceChars(FromChar: Char; ToChar: Char; Enable: Boolean);
354 A wordState returns a word from a reader. Like other
355 states, a tokenizer transfers the job of reading to this
356 state, depending on an initial character. Thus, the
357 tokenizer decides which characters may begin a word, and
358 this state determines which characters may appear as a
359 second or later character in a word. These are typically
360 different sets of characters; in particular, it is typical
361 for digits to appear as parts of a word, but not as the
362 initial character of a word.
364 By default, the following characters may appear in a word.
365 The method <code>setWordChars()</code> allows customizing
373 as well as: minus sign, underscore, and apostrophe.
376 TZWordState = class (TZTokenizerState)
378 FWordChars: array[0..ord(high(char))] of Boolean;
382 function NextToken(Stream: TStream; FirstChar: Char;
383 Tokenizer: TZTokenizer): TZToken; override;
384 procedure SetWordChars(FromChar: Char; ToChar: Char; Enable: Boolean);
388 A tokenizer divides a string into tokens. This class is
389 highly customizable with regard to exactly how this division
390 occurs, but it also has defaults that are suitable for many
391 languages. This class assumes that the character values read
392 from the string lie in the range 0-255. For example, the
393 Unicode value of a capital A is 65, so
394 <code> System.out.println((char)65); </code> prints out a
397 The behavior of a tokenizer depends on its character state
398 table. This table is an array of 256 <code>TokenizerState
399 </code> states. The state table decides which state to
400 enter upon reading a character from the input string.
402 For example, by default, upon reading an 'A', a tokenizer
403 will enter a "word" state. This means the tokenizer will
404 ask a <code>WordState</code> object to consume the 'A',
405 along with the characters after the 'A' that form a word.
406 The state's responsibility is to consume characters and
407 return a complete token.
409 The default table sets a SymbolState for every character
410 from 0 to 255, and then overrides this with:
413 0 ' ' whitespaceState
424 In addition to allowing modification of the state table,
425 this class makes each of the states above available. Some
426 of these states are customizable. For example, wordState
427 allows customization of what characters can be part of a
428 word, after the first character.
430 IZTokenizer = interface (IZInterface)
431 ['{C7CF190B-C45B-4AB4-A406-5999643DF6A0}']
433 function TokenizeBufferToList(const Buffer: string; Options: TZTokenOptions):
435 function TokenizeStreamToList(Stream: TStream; Options: TZTokenOptions):
438 function TokenizeBuffer(const Buffer: string; Options: TZTokenOptions):
440 function TokenizeStream(Stream: TStream; Options: TZTokenOptions):
443 function GetCommentState: TZCommentState;
444 function GetNumberState: TZNumberState;
445 function GetQuoteState: TZQuoteState;
446 function GetSymbolState: TZSymbolState;
447 function GetWhitespaceState: TZWhitespaceState;
448 function GetWordState: TZWordState;
449 function GetCharacterState(StartChar: Char): TZTokenizerState;
450 function AnsiGetEscapeString(const Ansi: RawByteString): String;
451 {$IF defined(FPC) and defined(WITH_RAWBYTESTRING)}
452 function GetEscapeString(const EscapeString: RawByteString): RawByteString;
454 function GetEscapeString(const EscapeString: String): String;
458 {** Implements a default tokenizer object. }
459 TZTokenizer = class (TZAbstractObject, IZTokenizer)
461 FCharacterStates: array[0..ord(high(char))] of TZTokenizerState;
462 FCommentState: TZCommentState;
463 FNumberState: TZNumberState;
464 FQuoteState: TZQuoteState;
465 FSymbolState: TZSymbolState;
466 FWhitespaceState: TZWhitespaceState;
467 FWordState: TZWordState;
468 FEscapeState: TZEscapeState; //EgonHugeist
470 function CheckEscapeState(const ActualState: TZTokenizerState;
471 Stream: TStream; const FirstChar: Char): TZTokenizerState; virtual;
474 destructor Destroy; override;
476 function AnsiGetEscapeString(const EscapeString: RawByteString): String; virtual;
477 {$IF defined(FPC) and defined(WITH_RAWBYTESTRING)}
478 function GetEscapeString(const EscapeString: RawByteString): RawByteString;
480 function GetEscapeString(const EscapeString: String): String;
482 function TokenizeBufferToList(const Buffer: string; Options: TZTokenOptions):
484 function TokenizeStreamToList(Stream: TStream; Options: TZTokenOptions):
487 function TokenizeBuffer(const Buffer: string; Options: TZTokenOptions):
489 function TokenizeStream(Stream: TStream; Options: TZTokenOptions):
492 function GetCharacterState(StartChar: Char): TZTokenizerState;
493 procedure SetCharacterState(FromChar, ToChar: Char; State: TZTokenizerState);
495 function GetEscapeState: TZEscapeState;
496 function GetCommentState: TZCommentState;
497 function GetNumberState: TZNumberState;
498 function GetQuoteState: TZQuoteState;
499 function GetSymbolState: TZSymbolState;
500 function GetWhitespaceState: TZWhitespaceState;
501 function GetWordState: TZWordState;
503 property EscapeState: TZEscapeState read FEscapeState write FEscapeState;
504 property CommentState: TZCommentState read FCommentState write FCommentState;
505 property NumberState: TZNumberState read FNumberState write FNumberState;
506 property QuoteState: TZQuoteState read FQuoteState write FQuoteState;
507 property SymbolState: TZSymbolState read FSymbolState write FSymbolState;
508 property WhitespaceState: TZWhitespaceState read FWhitespaceState
509 write FWhitespaceState;
510 property WordState: TZWordState read FWordState write FWordState;
514 EscapeMarkSequence = String('~<|');
526 { TZEscapeState } //EgonHugeist
529 Return a quoted Escape-data-string of token from a reader. This method
530 will collect characters until it sees a match to the
531 character that the tokenizer used to switch to this state.
533 @return a quoted string token from a reader
// Tries to read an escaped data section laid out as
//   <mark><digits><reversed mark><data><mark><digits><reversed mark>
// where <mark> is EscapeMarkSequence. The token type starts as ttUnknown and
// is switched to ttEscape only after every part has been matched.
// NOTE(review): several short lines (begin/end/else/var and some statements)
// are not visible in this excerpt, so some branch bodies are undocumented.
535 function TZEscapeState.NextToken(Stream: TStream; FirstChar: Char;
536 Tokenizer: TZTokenizer): TZToken;
539 TempStr, LenString: string;
540 I, IReadCount: Integer;
// Reads one Char from Stream into TempChar.
542 function ReadNextCharToTempChar: Boolean;
// A read of zero bytes means nothing more could be read from the stream.
545 if Stream.Read(TempChar, 1 * SizeOf(Char)) = 0 then
// Moves the stream position back by iReadCount characters.
554 procedure RollbackStream;
556 Stream.Seek(-(iReadCount * SizeOf(Char)), soFromCurrent);
// Compares the current TempChar with the first character of Marks and the
// following stream characters with the remaining characters of Marks.
561 function CheckMarkChars(Marks: String): Boolean;
567 if ( TempChar = Copy(Marks, 1, 1) ) then
568 for iMark := 2 to Length(Marks) do //The first char was already detected
570 if ReadNextCharToTempChar then
572 if not ( TempChar = Copy(Marks, iMark, 1) ) then
// Collects consecutive decimal digits from the stream. Reading stops at the
// first non-digit (which stays in TempChar) or when a read fails.
591 function ReadLengthString: String;
595 Result := ''; //init value
597 B := ReadNextCharToTempChar;
599 if CharInSet(TempChar, ['0'..'9']) then
600 Result := Result+TempChar;
601 until ( not CharInSet(TempChar, ['0'..'9'])) or ( not B );
604 Result.TokenType := ttUnknown;
607 iReadCount := 0; //init value
609 TempChar := FirstChar; //FirstChar: ~
// Opening in-mark; the function is left if it does not match.
611 if not CheckMarkChars(EscapeMarkSequence) then Exit;
613 //All in-mark chars were tested.
614 //Now collect numeric chars until a non-digit is met or reading fails
615 LenString := ReadLengthString;
616 if LenString = '' then
622 //Now check TempChar and the following chars against the reversed mark sequence
623 if not CheckMarkChars(ReverseString(EscapeMarkSequence)) then Exit;
625 //The out-mark was found too. So let's read the binary data into TempStr
626 //including the quotes
// The loop runs StrToInt(LenString) + 1 times (0..N), i.e. one character more
// than the announced length.
627 for i := 0 to StrToInt(LenString) do
629 if not ReadNextCharToTempChar then
632 TempStr := TempStr + TempChar;
634 //Done and still in here! Post the data to Result!
// The last collected character is dropped from the value; TempChar still holds
// the last character read and is what the next mark check starts from.
635 Result.Value := Copy(TempStr, 1, Length(TempStr)-1);
637 //Now check for the in-mark chars again..
638 if not CheckMarkChars(EscapeMarkSequence) then Exit;
639 //In-mark chars were found; now compare the second length with the first one
640 TempStr := LenString; //Save the first length before comparing
641 LenString := ReadLengthString;
642 if ( LenString = '' ) or ( LenString <> TempStr ) then
648 //Now check TempChar against the reversed mark sequence (out-mark) again..
649 if not CheckMarkChars(ReverseString(EscapeMarkSequence)) then Exit;
650 //Out-mark chars were found again; now we are done here
652 //Everything was fine! Now we are sure escape data was here
653 Result.TokenType := ttEscape;
{**
  Reads a number token from the stream.

  An optional leading minus, a run of digits, and an optional dot followed by
  more digits are absorbed. The character that ended the scan is pushed back.
  When no digit was seen at all, the absorbed '-' or '.' is handed over to the
  tokenizer's symbol state (if one is assigned) instead.

  @param Stream a stream to read the characters from.
  @param FirstChar the first character of the token, already read.
  @param Tokenizer the tokenizer that conducts the overall tokenization.
  @return a ttInteger or ttFloat token, or whatever the symbol state returns.
}
function TZNumberState.NextToken(Stream: TStream; FirstChar: Char;
  Tokenizer: TZTokenizer): TZToken;
var
  LastRead: Integer;
  HasMinus, HasDot, HasDigit: Boolean;
  Symbol: Char;

  { Collects consecutive digits, starting with the current FirstChar. }
  function AbsorbDigits: string;
  begin
    Result := '';
    while CharInSet(FirstChar, ['0'..'9']) do
    begin
      HasDigit := True;
      Result := Result + FirstChar;
      LastRead := Stream.Read(FirstChar, SizeOf(Char));
      if LastRead = 0 then
        Break;
    end;
  end;

begin
  LastRead := 0;
  HasDot := False;
  HasDigit := False;
  HasMinus := FirstChar = '-';

  Result.TokenType := ttUnknown;
  Result.Value := '';

  { Integer part, with an optional sign in front. }
  if HasMinus then
  begin
    LastRead := Stream.Read(FirstChar, SizeOf(Char));
    Result.Value := '-';
  end;
  Result.Value := Result.Value + AbsorbDigits;

  { Fractional part. }
  if FirstChar = '.' then
  begin
    HasDot := True;
    Result.Value := Result.Value + '.';
    LastRead := Stream.Read(FirstChar, SizeOf(Char));
    if LastRead > 0 then
      Result.Value := Result.Value + AbsorbDigits;
  end;

  { Push back the character that stopped the scan (nothing at end of stream). }
  Stream.Seek(-LastRead, soFromCurrent);

  if HasDigit then
  begin
    if HasDot then
      Result.TokenType := ttFloat
    else
      Result.TokenType := ttInteger;
    Exit;
  end;

  { No digit at all: this was only a '-' and/or a '.'. }
  if HasMinus and HasDot then
    Stream.Seek(-SizeOf(Char), soFromCurrent); // give the dot back as well

  if (HasMinus or HasDot) and (Tokenizer.SymbolState <> nil) then
  begin
    if HasMinus then
      Symbol := '-'
    else
      Symbol := '.';
    Result := Tokenizer.SymbolState.NextToken(Stream, Symbol, Tokenizer);
  end;
end;
{**
  Reads a quoted string token from the stream.

  Characters are collected until one equal to FirstChar is read. When the
  stream ends before that, FirstChar itself is appended as the closing quote.

  @param Stream a stream to read the characters from.
  @param FirstChar the quote character that started the token.
  @param Tokenizer the tokenizer that conducts the overall tokenization.
  @return a ttQuoted token including both quote characters.
}
function TZQuoteState.NextToken(Stream: TStream; FirstChar: Char;
  Tokenizer: TZTokenizer): TZToken;
var
  NextChar: Char;
begin
  Result.TokenType := ttQuoted;
  Result.Value := FirstChar;
  while True do
  begin
    if Stream.Read(NextChar, SizeOf(Char)) = 0 then
      NextChar := FirstChar; // end of input: act as if the quote was closed
    Result.Value := Result.Value + NextChar;
    if NextChar = FirstChar then
      Break;
  end;
end;
{**
  Encodes a string value by wrapping it into quote characters.
  The value itself is not changed or escaped.
  @param Value a string value to be encoded.
  @param QuoteChar a string quote character.
  @returns the value enclosed in QuoteChar on both sides.
}
function TZQuoteState.EncodeString(const Value: string; QuoteChar: Char): string;
begin
  Result := Concat(QuoteChar, Value, QuoteChar);
end;
{**
  Decodes a string value by stripping the enclosing quote characters.
  @param Value a string value to be decoded.
  @param QuoteChar a string quote character.
  @returns the text between the quotes when Value starts with QuoteChar and
    ends with the same character; otherwise Value unchanged.
}
function TZQuoteState.DecodeString(const Value: string; QuoteChar: Char): string;
var
  Len: Integer;
begin
  Result := Value;
  Len := Length(Value);
  if Len < 2 then
    Exit;
  if Value[1] <> QuoteChar then
    Exit;
  if Value[Len] <> Value[1] then
    Exit;
  Result := Copy(Value, 2, Len - 2);
end;
797 { TZCommentState }
{**
  Reads a single-line comment token from the stream.

  Everything from FirstChar up to, but not including, the next line feed or
  carriage return is collected. The line terminator is pushed back onto the
  stream so that the following state can see it.

  @param Stream a stream to read the characters from.
  @param FirstChar the character that started the comment, already read.
  @param Tokenizer the tokenizer that conducts the overall tokenization.
  @return a ttComment token holding the comment text.
}
function TZCommentState.NextToken(Stream: TStream; FirstChar: Char;
  Tokenizer: TZTokenizer): TZToken;
var
  ReadChar: Char;
  ReadStr: string;
  HitLineEnd: Boolean;
begin
  ReadStr := FirstChar;
  { Track explicitly whether a terminator was read. Testing ReadChar after the
    loop would inspect an uninitialized variable when the very first read
    hits the end of the stream. }
  HitLineEnd := False;
  while Stream.Read(ReadChar, SizeOf(Char)) > 0 do
  begin
    if CharInSet(ReadChar, [#10, #13]) then
    begin
      HitLineEnd := True;
      Break;
    end;
    ReadStr := ReadStr + ReadChar;
  end;

  { Only push back a terminator that was really consumed. }
  if HitLineEnd then
    Stream.Seek(-SizeOf(Char), soFromCurrent);

  Result.TokenType := ttComment;
  Result.Value := ReadStr;
end;
822 { TZCppCommentState }
{**
  Reads the remainder of a multi-line comment.

  Characters are collected until a star immediately followed by a slash has
  been read, or until the stream ends. The closing characters are part of the
  result.

  @param Stream a stream to read the characters from.
  @return the collected comment text.
}
function TZCppCommentState.GetMultiLineComment(Stream: TStream): string;
var
  Current, Previous: Char;
begin
  Result := '';
  Previous := #0;
  while Stream.Read(Current, SizeOf(Char)) > 0 do
  begin
    Result := Result + Current;
    if (Current = '/') and (Previous = '*') then
      Break;
    Previous := Current;
  end;
end;
{**
  Reads the remainder of a single-line comment, including its line ending.

  Characters are collected up to the first CR or LF. The terminator is kept
  in the result. After a CR one more character is inspected: a following
  CR or LF is appended as well, anything else is pushed back.

  @param Stream a stream to read the characters from.
  @return the collected comment text with its line ending, if there was one.
}
function TZCppCommentState.GetSingleLineComment(Stream: TStream): string;
var
  ReadChar: Char;
  HitLineEnd: Boolean;
begin
  Result := '';
  { Remember explicitly whether a terminator was read. Checking ReadChar after
    the loop would test an uninitialized variable when the stream is already
    exhausted on the first read. }
  HitLineEnd := False;
  while Stream.Read(ReadChar, SizeOf(Char)) > 0 do
  begin
    if CharInSet(ReadChar, [#10, #13]) then
    begin
      HitLineEnd := True;
      Break;
    end;
    Result := Result + ReadChar;
  end;
  if not HitLineEnd then
    Exit;

  // mdaems : for single line comments the line ending must be included
  // as it should never be stripped off or unified with other whitespace characters
  Result := Result + ReadChar;
  // ludob Linux line terminator is just LF, don't read further if we already have LF
  if ReadChar <> #10 then
    if Stream.Read(ReadChar, SizeOf(Char)) > 0 then
    begin
      if CharInSet(ReadChar, [#10, #13]) then
        Result := Result + ReadChar
      else
        Stream.Seek(-SizeOf(Char), soFromCurrent);
    end;
end;
{**
  Reads a C++ style comment or delegates to the symbol state.

  The character after FirstChar decides: a star starts a multi-line comment,
  a slash starts a single-line comment. Any other character is pushed back
  and FirstChar is passed to the tokenizer's symbol state, if one is assigned.

  @param Stream a stream to read the characters from.
  @param FirstChar the character that switched to this state.
  @param Tokenizer the tokenizer that conducts the overall tokenization.
  @return a ttComment token, the symbol state's token, or a ttUnknown token
    holding FirstChar when no symbol state is assigned.
}
function TZCppCommentState.NextToken(Stream: TStream; FirstChar: Char;
  Tokenizer: TZTokenizer): TZToken;
var
  Lookahead: Char;
  HaveLookahead: Boolean;
begin
  Result.TokenType := ttUnknown;
  Result.Value := FirstChar;

  HaveLookahead := Stream.Read(Lookahead, SizeOf(Char)) > 0;
  if HaveLookahead then
    case Lookahead of
      '*':
        begin
          Result.TokenType := ttComment;
          Result.Value := '/*' + GetMultiLineComment(Stream);
          Exit;
        end;
      '/':
        begin
          Result.TokenType := ttComment;
          Result.Value := '//' + GetSingleLineComment(Stream);
          Exit;
        end;
    end;

  { Not a comment: return the look-ahead character and treat FirstChar as a symbol. }
  if HaveLookahead then
    Stream.Seek(-SizeOf(Char), soFromCurrent);
  if Tokenizer.SymbolState <> nil then
    Result := Tokenizer.SymbolState.NextToken(Stream, FirstChar, Tokenizer);
end;
{**
  Reads a C style comment like /* */ or delegates to the symbol state.

  Only a slash followed by a star starts a comment. In every other case the
  look-ahead character (if one was read) is pushed back and FirstChar is
  passed to the tokenizer's symbol state, if one is assigned.

  @param Stream a stream to read the characters from.
  @param FirstChar the character that switched to this state.
  @param Tokenizer the tokenizer that conducts the overall tokenization.
  @return a ttComment token, the symbol state's token, or a ttUnknown token
    holding FirstChar when no symbol state is assigned.
}
function TZCCommentState.NextToken(Stream: TStream; FirstChar: Char;
  Tokenizer: TZTokenizer): TZToken;
var
  Lookahead: Char;
  Count: Integer;
begin
  Result.TokenType := ttUnknown;
  Result.Value := FirstChar;

  if FirstChar = '/' then
  begin
    Count := Stream.Read(Lookahead, SizeOf(Char));
    if (Count > 0) and (Lookahead = '*') then
    begin
      Result.TokenType := ttComment;
      Result.Value := '/*' + GetMultiLineComment(Stream);
      Exit;
    end;
    if Count > 0 then
      Stream.Seek(-SizeOf(Char), soFromCurrent);
  end;

  { Still unknown at this point, so let the symbol state take over. }
  if Tokenizer.SymbolState <> nil then
    Result := Tokenizer.SymbolState.NextToken(Stream, FirstChar, Tokenizer);
end;
{**
  Constructs a symbol node with the given parent, representing the given
  character. The node starts out as not valid and with room for 256 children.
  @param Parent this node's parent
  @param Character this node's character
}
constructor TZSymbolNode.Create(Parent: TZSymbolNode; Character: Char);
begin
  SetLength(FChildren, 256);
  FValid := False;
  FCharacter := Character;
  FParent := Parent;
end;
{**
  Destroys this symbol node together with its children and cleans up the memory.
  Children are stored without gaps, so freeing stops at the first empty slot.
}
destructor TZSymbolNode.Destroy;
var
  Index: Integer;
  Child: TZSymbolNode;
begin
  for Index := 0 to 255 do
  begin
    Child := FChildren[Index];
    if Child = nil then
      Break;
    Child.Free;
  end;
  SetLength(FChildren, 0);
  FParent := nil;
  inherited Destroy;
end;
{**
  Adds a line of descendants that represent the characters in the given
  string: one nested child per character, starting below this node.
  @param Value the characters to add as a chain of descendants.
}
procedure TZSymbolNode.AddDescendantLine(const Value: string);
var
  Index: Integer;
  Node: TZSymbolNode;
begin
  Node := Self;
  for Index := 1 to Length(Value) do
    Node := Node.EnsureChildWithChar(Value[Index]);
end;
{**
  Shows the symbol this node represents: the parent's ancestry followed by
  this node's own character.
  @return the symbol this node represents
}
function TZSymbolNode.Ancestry: string;
var
  Prefix: string;
begin
  Prefix := FParent.Ancestry;
  Result := Prefix + FCharacter;
end;
{**
  Finds the descendant that takes as many characters as possible from the
  input. The character that could not be matched is pushed back.
  @param Stream a stream to read the characters from.
  @return the deepest matching node, or this node if nothing matched.
}
function TZSymbolNode.DeepestRead(Stream: TStream): TZSymbolNode;
var
  NextChar: Char;
  Child: TZSymbolNode;
  Count: Integer;
begin
  Result := Self;
  repeat
    Count := Stream.Read(NextChar, SizeOf(Char));
    if Count > 0 then
      Child := Result.FindChildWithChar(NextChar)
    else
      Child := nil;
    if Child <> nil then
      Result := Child;
  until Child = nil;
  { Return the unmatched character; Count is 0 at the end of the stream. }
  Stream.Seek(-Count, soFromCurrent);
end;
{**
  Finds or creates a child for the given character.

  A new child is stored in the first free slot of the children array.
  @param Value the character the child has to represent.
  @return the existing or newly created child, or nil when no child matches
    and the children array has no free slot left.
}
function TZSymbolNode.EnsureChildWithChar(Value: Char): TZSymbolNode;
var
  N: Integer;
begin
  Result := FindChildWithChar(Value);
  if Result <> nil then
    Exit;

  { Look for the first free slot. The index is checked BEFORE the array is
    accessed, so a completely filled array is never read past its end. }
  N := 0;
  while N <= High(FChildren) do
  begin
    if FChildren[N] = nil then
      Break;
    Inc(N);
  end;

  if N <= High(FChildren) then
  begin
    Result := TZSymbolNode.Create(Self, Value);
    FChildren[N] := Result;
  end;
end;
{**
  Finds a child with the given character.
  Children are stored without gaps, so the search ends at the first empty slot.
  @param Value the character to look for.
  @return the matching child or nil if there is none.
}
function TZSymbolNode.FindChildWithChar(Value: Char): TZSymbolNode;
var
  Index: Integer;
  Candidate: TZSymbolNode;
begin
  Result := nil;
  for Index := 0 to 255 do
  begin
    Candidate := Children[Index];
    if Candidate = nil then
      Exit;
    if Candidate.Character = Value then
    begin
      Result := Candidate;
      Exit;
    end;
  end;
end;
{**
  Finds a descendant which is down the path the given string indicates.
  An empty string is looked up as the single character #0.
  @param Value the characters that describe the path below this node.
  @return the node at the end of the path, or nil if the path does not exist.
}
function TZSymbolNode.FindDescendant(const Value: string): TZSymbolNode;
var
  Index: Integer;
begin
  if Length(Value) = 0 then
  begin
    Result := FindChildWithChar(#0);
    Exit;
  end;

  Result := Self;
  for Index := 1 to Length(Value) do
  begin
    Result := Result.FindChildWithChar(Value[Index]);
    if Result = nil then
      Break;
  end;
end;
{**
  Unwinds to a valid node; a node is "valid" if its ancestry represents a
  complete symbol. For every node that is not valid one character is put
  back onto the stream and the search continues with the parent.
  @param Stream a stream to push the characters back to.
  @return the nearest valid node, starting with this node itself.
}
function TZSymbolNode.UnreadToValid(Stream: TStream): TZSymbolNode;
begin
  Result := Self;
  while not Result.FValid do
  begin
    Stream.Seek(-SizeOf(Char), soFromCurrent);
    Result := Result.FParent;
  end;
end;
1096 { TZSymbolRootNode }
{**
  Creates and initializes a root node.
  The root gets one valid child for each of the characters #0..#255.
}
constructor TZSymbolRootNode.Create;
var
  Code: Integer;
  Child: TZSymbolNode;
begin
  inherited Create(nil, #0);

  for Code := 0 to 255 do
  begin
    Child := TZSymbolNode.Create(Self, Chr(Code));
    Child.Valid := True;
    FChildren[Code] := Child;
  end;
end;
{**
  Adds the given string as a symbol: the nodes for all of its characters are
  created if needed and the last node of the path is marked as valid.
  @param Value the character sequence to add
}
procedure TZSymbolRootNode.Add(const Value: string);
var
  Head: Char;
begin
  if Value <> '' then
    Head := Value[1]
  else
    Head := #0;

  EnsureChildWithChar(Head).AddDescendantLine(Copy(Value, 2, Length(Value) - 1));
  FindDescendant(Value).Valid := True;
end;
{**
  A root node has no parent and no character of its own, so its ancestry is "".
  This ends the recursion of the child nodes' Ancestry.
  @return an empty string
}
function TZSymbolRootNode.Ancestry: string;
begin
  SetLength(Result, 0);
end;
{**
  A root node keeps its children in an array indexed by the character code,
  so the child is fetched directly instead of being searched for.
  @param Value the character to look for.
  @return the child stored at index Ord(Value).
}
function TZSymbolRootNode.FindChildWithChar(Value: Char): TZSymbolNode;
var
  Code: Integer;
begin
  Code := Ord(Value);
  Result := FChildren[Code];
end;
{**
  Returns a symbol string from a reader.

  Starting at the child for FirstChar the tree is walked down as deep as the
  input allows, then unwound to the nearest valid node.

  @param Stream a reader to read from
  @param FirstChar the first character of this symbol, already
    read from the reader
  @return the ancestry of the node that was finally reached
}
function TZSymbolRootNode.NextSymbol(Stream: TStream; FirstChar: Char): string;
begin
  Result := FindChildWithChar(FirstChar)
    .DeepestRead(Stream)
    .UnreadToValid(Stream)
    .Ancestry;
end;
{**
  Constructs a symbol state and creates the root node of its symbol tree.
  Multi-character symbols are registered afterwards with Add.
}
constructor TZSymbolState.Create;
var
  Root: TZSymbolRootNode;
begin
  Root := TZSymbolRootNode.Create;
  FSymbols := Root;
end;
{**
  Destroys this object and cleans up the memory.
  NOTE(review): the body is not visible in this excerpt; releasing the symbol
  tree before calling the inherited destructor is assumed - confirm.
}
destructor TZSymbolState.Destroy;
begin
  FSymbols.Free;
  inherited Destroy;
end;
{**
  Adds a multi-character symbol to the symbol tree of this state.
  @param Value the symbol to add, such as "=:="
}
procedure TZSymbolState.Add(const Value: string);
begin
  Symbols.Add(Value);
end;
{**
  Returns a symbol token from a reader.
  @param Stream a stream to read the characters from.
  @param FirstChar the first character of the symbol, already read.
  @param Tokenizer the tokenizer that conducts the overall tokenization.
  @return a ttSymbol token holding the symbol found by the symbol tree.
}
function TZSymbolState.NextToken(Stream: TStream; FirstChar: Char;
  Tokenizer: TZTokenizer): TZToken;
var
  Symbol: string;
begin
  Symbol := FSymbols.NextSymbol(Stream, FirstChar);
  Result.TokenType := ttSymbol;
  Result.Value := Symbol;
end;
1208 { TZWhitespaceState }
1211 Constructs a whitespace state with a default idea of what
1212 characters are, in fact, whitespace.
{ Sets up the default whitespace table in two passes. Order matters: the first
  call clears the range starting at the space character, the second call then
  enables #0 up to and including the space character again. }
1214 constructor TZWhitespaceState.Create;
1216 SetWhitespaceChars(' ', high(char), False);
1217 SetWhitespaceChars(Chr(0), ' ', True);
1221 Ignore whitespace (such as blanks and tabs), and return
1222 the tokenizer's next token.
1223 @return the tokenizer's next token
{ Gathers a run of whitespace that begins with FirstChar and returns it as a
  ttWhitespace token. The Tokenizer argument is not referenced below. }
1225 function TZWhitespaceState.NextToken(Stream: TStream; FirstChar: Char;
1226 Tokenizer: TZTokenizer): TZToken;
1232 ReadStr := FirstChar;
{ One Char-sized unit is read per step; the test below detects the end of the
  stream (nothing read) or a character that is not flagged as whitespace. }
1236 ReadNum := Stream.Read(ReadChar, 1 * SizeOf(Char));
1237 if (ReadNum = 0) or not FWhitespaceChars[Ord(ReadChar)] then
1239 ReadStr := ReadStr + ReadChar;
{ Moves the stream back by one Char so the character that ended the run can
  be read again by the caller. }
1243 Stream.Seek(-(1 * SizeOf(Char)), soFromCurrent);
1244 Result.TokenType := ttWhitespace;
1245 Result.Value := ReadStr;
1249 Establish the given characters as whitespace to ignore.
1250 @param FromChar first character index.
1251 @param ToChar last character index.
1252 @param Enable true, if this state should ignore characters in the given range
{ Writes Enable into the whitespace table for every character code from
  FromChar up to ToChar, with the upper bound capped at 255.
  NOTE(review): TZWordState.SetWordChars and TZTokenizer.SetCharacterState cap
  at Ord(high(char)) instead of 255 - confirm whether the difference is
  intended and matches the declared size of FWhitespaceChars. }
1254 procedure TZWhitespaceState.SetWhitespaceChars(FromChar, ToChar: Char;
1259 for I := Ord(FromChar) to MinIntValue([Ord(ToChar), 255]) do
1260 FWhitespaceChars[I] := Enable;
1266 Constructs a word state with a default idea of what characters
1267 are admissible inside a word (as described in the class comment).
{ Sets up the default table of characters accepted inside a word. Later calls
  override earlier ones for the ranges they cover. }
1269 constructor TZWordState.Create;
{ Start with #0..#191 disabled and everything from #192 upwards enabled. }
1271 SetWordChars(#0, #191, False);
1272 SetWordChars(#192, high(char), True);
{ Then enable letters, digits, minus, underscore and the apostrophe. }
1273 SetWordChars('a', 'z', True);
1274 SetWordChars('A', 'Z', True);
1275 SetWordChars('0', '9', True);
1276 SetWordChars('-', '-', True);
1277 SetWordChars('_', '_', True);
1278 SetWordChars('''', '''', True);
1282 Return a word token from a reader.
1283 @return a word token from a reader
{ Reads the characters of a word from Stream and returns them as a ttWord
  token. The Tokenizer argument is not referenced below. }
1285 function TZWordState.NextToken(Stream: TStream; FirstChar: Char;
1286 Tokenizer: TZTokenizer): TZToken;
{ One Char-sized unit is read per step; the test below detects the end of the
  stream (nothing read) or a character that is not flagged as a word char. }
1294 ReadNum := Stream.Read(TempChar, 1 * SizeOf(Char));
1295 if (ReadNum = 0) or not FWordChars[Ord(TempChar)] then
1297 Value := Value + TempChar;
{ Moves the stream back by one Char so the character that ended the word can
  be read again by the caller. }
1301 Stream.Seek(-(1 * SizeOf(Char)), soFromCurrent);
1302 Result.TokenType := ttWord;
1303 Result.Value := Value;
1307 Establish characters in the given range as valid
1308 characters for part of a word after the first character.
1309 Note that the tokenizer must determine which characters
1310 are valid as the beginning character of a word.
1311 @param FromChar first character index.
1312 @param ToChar last character index.
1313 @param Enable true, if this state should accept characters in the given range as part of a word
{ Writes Enable into the word-character table for every character code from
  FromChar up to ToChar; the upper bound is capped at Ord(high(char)). }
1315 procedure TZWordState.SetWordChars(FromChar, ToChar: Char; Enable: Boolean);
1319 for I := Ord(FromChar) to MinIntValue([Ord(ToChar), Ord(high(char)) ]) do
1320 FWordChars[I] := Enable;
1326 Constructs a tokenizer with a default state table (as
1327 described in the class comment).
{ Creates the seven default state objects and fills the character-to-state
  table. The table is filled coarse ranges first; the later, narrower calls
  override the entries they cover. }
1329 constructor TZTokenizer.Create;
1331 FSymbolState := TZSymbolState.Create;
1332 with TZSymbolState(FSymbolState) do
1338 FEscapeState := TZEscapeState.Create;
1339 FNumberState := TZNumberState.Create;
1340 FQuoteState := TZQuoteState.Create;
1341 FWhitespaceState := TZWhitespaceState.Create;
1342 FWordState := TZWordState.Create;
1343 FCommentState := TZCppCommentState.Create;
{ Coarse defaults: #0..#32 whitespace, #33..#191 symbol, #192 and above word. }
1345 SetCharacterState(#0, #32, FWhitespaceState);
1346 SetCharacterState(#33, #191, FSymbolState);
1347 SetCharacterState(#192, High(Char), FWordState);
{ Refinements inside the symbol range: letters start words; digits, minus and
  dot start numbers; both quote characters start quoted strings; the slash
  starts a comment. }
1349 SetCharacterState('a', 'z', FWordState);
1350 SetCharacterState('A', 'Z', FWordState);
1351 SetCharacterState('0', '9', FNumberState);
1352 SetCharacterState('-', '-', FNumberState);
1353 SetCharacterState('.', '.', FNumberState);
1354 SetCharacterState('"', '"', FQuoteState);
1355 SetCharacterState('''', '''', FQuoteState);
1356 SetCharacterState('/', '/', FCommentState);
1360 Destroys this object and cleans up the memory.
destructor TZTokenizer.Destroy;
begin
  { Releases the seven state objects held by the tokenizer. TObject.Free
    already does nothing for a nil reference, so no explicit nil tests are
    needed in front of the calls. }
  FEscapeState.Free;
  FCommentState.Free;
  FNumberState.Free;
  FQuoteState.Free;
  FSymbolState.Free;
  FWhitespaceState.Free;
  FWordState.Free;

  inherited Destroy;
end;
1383 Gets an initial state object for the specified character.
1384 @return an initial state object for the character.
{ Returns the entry of the character-to-state table at the ordinal value of
  StartChar. }
1386 function TZTokenizer.GetCharacterState(StartChar: Char): TZTokenizerState;
1388 Result := FCharacterStates[Ord(StartChar)];
1392 Change the state the tokenizer will enter upon reading
1393 any character between "from" and "to".
1395 @param FromChar first character index.
1396 @param ToChar last character index.
1397 @param State the state to enter upon reading a
1398 character between "fromChar" and "toChar"
{ Stores State in the character-to-state table for every character code from
  FromChar up to ToChar. }
1400 procedure TZTokenizer.SetCharacterState(FromChar, ToChar: Char;
1401 State: TZTokenizerState);
{ Highest ordinal a Char can take; used as the cap for the loop's upper bound. }
1405 ORDMAXCHAR = ord(high(char));
1407 for I := Ord(FromChar) to MinIntValue([Ord(ToChar), ORDMAXCHAR]) do
1408 FCharacterStates[I] := State;
1412 Tokenizes a string buffer into a dynamic array of tokens.
1413 @param Buffer a string buffer to be tokenized.
1414 @param Options a set of tokenizer options.
1415 @returns a dynamic array of tokens
{ Wraps Buffer in a TStringStream and hands it to TokenizeStream. }
1417 function TZTokenizer.TokenizeBuffer(const Buffer: string;
1418 Options: TZTokenOptions): TZTokenDynArray;
{ When WITH_TENCODING_CLASS is defined the stream is created with
  TEncoding.Unicode; otherwise the one-argument constructor is used. }
1422 Stream := TStringStream.Create(Buffer{$IFDEF WITH_TENCODING_CLASS}, TEncoding.Unicode{$ENDIF});
1424 Result := TokenizeStream(Stream, Options);
{ Surrounds a raw byte string with escape markers and returns it as String. }
1430 function TZTokenizer.AnsiGetEscapeString(const EscapeString: RawByteString): String;
{ The marker is EscapeMarkSequence, then the decimal length of EscapeString,
  then EscapeMarkSequence reversed. }
1434 Temp := EscapeMarkSequence+IntToStr(Length(EscapeString))+ReverseString(EscapeMarkSequence);
{ A non-empty input is converted to String and placed between two copies of
  the marker. }
1436 if Length(EscapeString) > 0 then
1437 Result := Temp+String(EscapeString)+Temp
{ Surrounds EscapeString with escape markers. Two signatures exist; the
  RawByteString one is compiled under FPC when WITH_RAWBYTESTRING is defined. }
1442 {$IF defined(FPC) and defined(WITH_RAWBYTESTRING)}
1443 function TZTokenizer.GetEscapeString(const EscapeString: RawByteString): RawByteString;
1445 function TZTokenizer.GetEscapeString(const EscapeString: String): String;
{ The marker is EscapeMarkSequence, then the decimal length of EscapeString,
  then EscapeMarkSequence reversed. }
1450 Temp := EscapeMarkSequence+IntToStr(Length(EscapeString))+ReverseString(EscapeMarkSequence);
{ A non-empty input is placed between two copies of the marker; in the
  RawByteString build the marker is cast to RawByteString first. }
1452 if Length(EscapeString) > 0 then
1453 {$IF defined(FPC) and defined(WITH_RAWBYTESTRING)}
1454 Result := RawByteString(Temp)+EscapeString+RawByteString(Temp)
1456 Result := Temp+EscapeString+Temp
1464 Checks whether the token starting with FirstChar begins the escape mark sequence and, if so, selects the EscapeState.
1465 @param Stream the read stream whose next characters are checked.
1466 @param FirstChar the first character that was read and that selected ActualState.
1467 @returns either the given ActualState or the EscapeState.
{ Decides whether the token that starts with FirstChar is introduced by the
  complete EscapeMarkSequence.

  ActualState  the state selected for FirstChar by the character table.
  Stream       the stream positioned right behind FirstChar; it is probed for
               the remaining mark characters and always rewound to where it
               was on entry.
  FirstChar    the character already consumed from Stream.

  Returns EscapeState when FirstChar plus the following characters spell the
  whole mark sequence, otherwise ActualState. }
function TZTokenizer.CheckEscapeState(const ActualState: TZTokenizerState;
  Stream: TStream; const FirstChar: Char): TZTokenizerState;
var
  NextChar: Char;
  BytesRead, TotalBytes, I: Integer;
  Matched: Boolean;
begin
  Result := ActualState;

  { Nothing to probe unless the token starts like the escape mark. The length
    test also keeps the [1] index safe for an empty mark sequence. }
  if (Length(EscapeMarkSequence) = 0)
    or (FirstChar <> EscapeMarkSequence[1]) then
    Exit;

  TotalBytes := 0;
  Matched := True;
  for I := 2 to Length(EscapeMarkSequence) do
  begin
    BytesRead := Stream.Read(NextChar, SizeOf(Char));
    Inc(TotalBytes, BytesRead);
    { A short read means the stream ended inside the mark, so this cannot be
      an escape sequence; a differing character rules it out as well. }
    if (BytesRead < SizeOf(Char)) or (NextChar <> EscapeMarkSequence[I]) then
    begin
      Matched := False;
      Break;
    end;
  end;

  { Rewind by the bytes actually consumed, not by a character count, so the
    position is restored exactly even after a partial read. }
  if TotalBytes > 0 then
    Stream.Seek(-TotalBytes, soFromCurrent);

  if Matched then
    Result := Self.EscapeState;
end;
1498 Tokenizes a string buffer into a list of tokens.
1499 @param Buffer a string buffer to be tokenized.
1500 @param Options a set of tokenizer options.
1501 @returns a string list where Items are tokens and
1502 Objects are token types.
{ Wraps Buffer in a TStringStream and hands it to TokenizeStreamToList. }
1504 function TZTokenizer.TokenizeBufferToList(const Buffer: string;
1505 Options: TZTokenOptions): TStrings;
{ When WITH_TENCODING_CLASS is defined the stream is created with
  TEncoding.Unicode; otherwise the one-argument constructor is used. }
1509 Stream := TStringStream.Create(Buffer{$IFDEF WITH_TENCODING_CLASS}, TEncoding.Unicode{$ENDIF});
1511 Result := TokenizeStreamToList(Stream, Options);
1518 Tokenizes a stream into a dynamic array of tokens.
1519 @param Stream a stream to be tokenized.
1520 @param Options a set of tokenizer options.
1521 @returns a dynamic array of tokens
{ Tokenizes Stream through TokenizeStreamToList and copies the list into a
  dynamic array of tokens. }
1523 function TZTokenizer.TokenizeStream(Stream: TStream;
1524 Options: TZTokenOptions): TZTokenDynArray;
1529 List := TokenizeStreamToList(Stream, Options);
1531 SetLength(Result, List.Count);
1532 for I := 0 to List.Count - 1 do
{ The item text becomes the token value; the token type was stored in the
  Objects slot and is cast back here (through Pointer first under FPC). }
1534 Result[I].Value := List[I];
1535 Result[I].TokenType := TZTokenType({$IFDEF FPC}Pointer({$ENDIF}
1536 List.Objects[I]{$IFDEF FPC}){$ENDIF});
1544 Tokenizes a stream into a string list of tokens.
1545 @param Stream a stream to be tokenized.
1546 @param Options a set of tokenizer options.
1547 @returns a string list where Items are tokens and
1548 Objects are token types.
{ Tokenizes Stream into a newly created string list. Each item is the text of
  one token and its Objects slot carries the ordinal of the token type. The
  caller owns the returned list.

  Stream   the stream to read Char-sized units from.
  Options  toDecodeStrings, toSkipComments, toSkipWhitespaces,
           toUnifyWhitespaces, toUnifyNumbers, toSkipUnknown and toSkipEOF
           are evaluated here.

  If reading or a state raises an exception the partly filled list is freed
  before the exception is passed on. }
function TZTokenizer.TokenizeStreamToList(Stream: TStream;
  Options: TZTokenOptions): TStrings;
var
  FirstChar: Char;
  Token: TZToken;
  LastTokenType: TZTokenType;  { type of the last token stored in Result }
  CurReadType: TZTokenType;    { type of the token read in this iteration }
  PrevReadType: TZTokenType;   { type of the token read right before it }
  State: TZTokenizerState;
begin
  Result := TStringList.Create;
  try
    LastTokenType := ttUnknown;
    CurReadType := ttUnknown;

    while Stream.Read(FirstChar, 1 * SizeOf(Char)) > 0 do
    begin
      PrevReadType := CurReadType;
      State := FCharacterStates[Ord(FirstChar)];

      { No state registered for this character: it is an unknown token. }
      if State = nil then
      begin
        CurReadType := ttUnknown;
        if not (toSkipUnknown in Options) then
        begin
          { Keep LastTokenType in step with what is stored in the list. }
          LastTokenType := ttUnknown;
          Result.AddObject(FirstChar, TObject(Ord(ttUnknown)));
        end;
        Continue;
      end;

      State := CheckEscapeState(State, Stream, FirstChar);
      Token := State.NextToken(Stream, FirstChar, Self);
      CurReadType := Token.TokenType;

      { Decodes quoted strings if option set. }
      if (State is TZQuoteState) and (toDecodeStrings in Options) then
        Token.Value := TZQuoteState(State).DecodeString(Token.Value, FirstChar);

      { Skips comments if option set. }
      if (Token.TokenType = ttComment) and (toSkipComments in Options) then
        Continue;

      if Token.TokenType = ttWhitespace then
      begin
        { Skips whitespaces if option set. }
        if toSkipWhitespaces in Options then
          Continue;
        { Unifies whitespaces if option set. }
        if toUnifyWhitespaces in Options then
        begin
          if LastTokenType = ttWhitespace then
            Continue;
          Token.Value := ' ';
        end;
      end;

      { Unifies numbers if option set. }
      if (Token.TokenType in [ttInteger, ttFloat, ttHexDecimal])
        and (toUnifyNumbers in Options) then
        Token.TokenType := ttNumber;

      { An integer immediately followed by a word is seen as one word.
        PrevReadType makes sure the integer really is the directly preceding
        token and not one separated from the word by a skipped whitespace,
        comment or unknown character. LastTokenType makes sure it is the last
        list item and was stored as ttInteger. }
      if (Token.TokenType = ttWord) and (LastTokenType = ttInteger)
        and (PrevReadType = ttInteger) then
      begin
        Token.Value := Result[Result.Count - 1] + Token.Value;
        Result.Delete(Result.Count - 1);
      end;

      { Add a read token. }
      LastTokenType := Token.TokenType;
      Result.AddObject(Token.Value, TObject(Ord(Token.TokenType)));
    end;

    { Adds an EOF if option is not set. }
    if not (toSkipEOF in Options) then
      Result.AddObject('', TObject(Ord(ttEOF)));
  except
    { Do not leak the list when tokenizing fails half way. }
    Result.Free;
    raise;
  end;
end;
1616 Gets a tokenizer default Escape state.
1617 @returns a tokenizer default Escape state.
{ Returns the tokenizer's EscapeState.
  NOTE(review): presumably EscapeState reads the FEscapeState field assigned
  in the constructor - confirm it is not routed through this getter. }
1619 function TZTokenizer.GetEscapeState: TZEscapeState;
1621 Result := EscapeState;
1625 Gets a tokenizer default comment state.
1626 @returns a tokenizer default comment state.
{ Returns the tokenizer's CommentState.
  NOTE(review): presumably CommentState reads the FCommentState field assigned
  in the constructor - confirm it is not routed through this getter. }
1628 function TZTokenizer.GetCommentState: TZCommentState;
1630 Result := CommentState;
1634 Gets a tokenizer default number state.
1635 @returns a tokenizer default number state.
{ Returns the tokenizer's NumberState.
  NOTE(review): presumably NumberState reads the FNumberState field assigned
  in the constructor - confirm it is not routed through this getter. }
1637 function TZTokenizer.GetNumberState: TZNumberState;
1639 Result := NumberState;
1643 Gets a tokenizer default quote state.
1644 @returns a tokenizer default quote state.
{ Returns the tokenizer's QuoteState.
  NOTE(review): presumably QuoteState reads the FQuoteState field assigned
  in the constructor - confirm it is not routed through this getter. }
1646 function TZTokenizer.GetQuoteState: TZQuoteState;
1648 Result := QuoteState;
1652 Gets a tokenizer default symbol state.
1653 @returns a tokenizer default symbol state.
{ Returns the tokenizer's SymbolState.
  NOTE(review): presumably SymbolState reads the FSymbolState field assigned
  in the constructor - confirm it is not routed through this getter. }
1655 function TZTokenizer.GetSymbolState: TZSymbolState;
1657 Result := SymbolState;
1661 Gets a tokenizer default whitespace state.
1662 @returns a tokenizer default whitespace state.
{ Returns the tokenizer's WhitespaceState.
  NOTE(review): presumably WhitespaceState reads the FWhitespaceState field
  assigned in the constructor - confirm it is not routed through this getter. }
1664 function TZTokenizer.GetWhitespaceState: TZWhitespaceState;
1666 Result := WhitespaceState;
1670 Gets a tokenizer default word state.
1671 @returns a tokenizer default word state.
{ Returns the tokenizer's WordState.
  NOTE(review): presumably WordState reads the FWordState field assigned
  in the constructor - confirm it is not routed through this getter. }
1673 function TZTokenizer.GetWordState: TZWordState;
1675 Result := WordState;