zeoslib  UNKNOWN
 All Files
ZTokenizer.pas
Go to the documentation of this file.
1 {*********************************************************}
2 { }
3 { Zeos Database Objects }
4 { String tokenizing classes and interfaces }
5 { }
6 { Originally written by Sergey Seroukhov }
7 { }
8 {*********************************************************}
9 
10 {@********************************************************}
11 { Copyright (c) 1999-2012 Zeos Development Group }
12 { }
13 { License Agreement: }
14 { }
15 { This library is distributed in the hope that it will be }
16 { useful, but WITHOUT ANY WARRANTY; without even the }
17 { implied warranty of MERCHANTABILITY or FITNESS FOR }
18 { A PARTICULAR PURPOSE. See the GNU Lesser General }
19 { Public License for more details. }
20 { }
21 { The source code of the ZEOS Libraries and packages are }
22 { distributed under the Library GNU General Public }
23 { License (see the file COPYING / COPYING.ZEOS) }
24 { with the following modification: }
25 { As a special exception, the copyright holders of this }
26 { library give you permission to link this library with }
27 { independent modules to produce an executable, }
28 { regardless of the license terms of these independent }
29 { modules, and to copy and distribute the resulting }
30 { executable under terms of your choice, provided that }
31 { you also meet, for each linked independent module, }
32 { the terms and conditions of the license of that module. }
33 { An independent module is a module which is not derived }
34 { from or based on this library. If you modify this }
35 { library, you may extend this exception to your version }
36 { of the library, but you are not obligated to do so. }
37 { If you do not wish to do so, delete this exception }
38 { statement from your version. }
39 { }
40 { }
41 { The project web site is located on: }
42 { http://zeos.firmos.at (FORUM) }
43 { http://sourceforge.net/p/zeoslib/tickets/ (BUGTRACKER)}
44 { svn://svn.code.sf.net/p/zeoslib/code-0/trunk (SVN) }
45 { }
46 { http://www.sourceforge.net/projects/zeoslib. }
47 { }
48 { }
49 { Zeos Development Group. }
50 {********************************************************@}
51 
52 unit ZTokenizer;
53 
54 interface
55 
56 {$I ZCore.inc}
57 
58 uses
59  Classes, {$IFDEF MSEgui}mclasses,{$ENDIF} SysUtils,
60  ZClasses, ZCompatibility;
61 
62 type
63 
64  {**
65  Objects of this class represent a type of token,
66  such as "number", "symbol" or "word".
67  }
68  TZTokenType = (ttUnknown, ttEOF, ttFloat, ttInteger, ttHexDecimal,
69  ttNumber, ttSymbol, ttQuoted, ttQuotedIdentifier, ttWord, ttKeyword,
70  ttWhitespace, ttComment, ttSpecial, ttTime, ttDate, ttDateTime, ttEscape);
71 
72  {**
73  Defines options for tokenizing strings.
74  }
75  TZTokenOption = (toSkipUnknown, toSkipWhitespaces, toSkipComments,
76  toSkipEOF, toUnifyWhitespaces, toUnifyNumbers, toDecodeStrings);
77  TZTokenOptions = set of TZTokenOption;
78 
79  {**
80  A token represents a logical chunk of a string. For
81  example, a typical tokenizer would break the string
82  <code>"1.23 <= 12.3"</code> into three tokens: the number
83  1.23, a less-than-or-equal symbol, and the number 12.3. A
84  token is a receptacle, and relies on a tokenizer to decide
85  precisely how to divide a string into tokens.
86  }
87  TZToken = {$ifndef FPC_REQUIRES_PROPER_ALIGNMENT}packed{$endif} record
88  Value: string;
89  TokenType: TZTokenType;
90  end;
91 
92  {** Defines a dynamic array of tokens. }
93  TZTokenDynArray = array of TZToken;
94 
95  // Forward declaration
96  TZTokenizer = class;
97 
98  {**
99  A tokenizerState returns a token, given a reader, an initial character
100  read from the reader, and a tokenizer that is conducting an overall
101  tokenization of the reader. The tokenizer will typically have a character
102  state table that decides which state to use, depending on an initial
103  character. If a single character is insufficient, a state such
104  as <code>SlashState</code> will read a second character, and may delegate
105  to another state, such as <code>SlashStarState</code>. This prospect
106  of delegation is the reason that the <code>nextToken()</code> method has a
107  tokenizer argument.
108  }
109  TZTokenizerState = class (TObject)
110  public
111  function NextToken(Stream: TStream; FirstChar: Char;
112  Tokenizer: TZTokenizer): TZToken; virtual; abstract;
113  end;
114 
115  {**
116  EgonHugeist:
117  A <code>EsacapeState</code> object returns bininary/String-data from a reader.
118  This
119  state's idea is save work-around of DataSet given binary/String-Data.
120  So it has some requirements to pick out this data from the SQL-
121  String:
122 
123  First: We have to define one or some Chars to detect this state.
124  Example: If data data was given like;
125  ~<|:%d|<~'...Binary/StringData...'~<|:%d|<~
126  we are able to predetect this State.
127 
128  Second: The parameter d represents an Integer(Count of Chars)
129  if we do not use this it's possible that the Tokenizer is
130  vinny-nilly on getting binary-Data!
131 
132  Third: The GenerigResolver who assambles the insert/update
133  Statements has to add this ass prefix and suffix.
134 
135  Fourth: The User of this Component has to know this too. So has to do this
136  previously if he want to insert/update binary-data in a self
137  assembled Query. So i think it would be better to add an published
138  read-only Property like:
139  EscapeChars: String;
140 
141  If we did this corectly we are able to disassemble all queries and
142  do execute the nessesary UTF8Encoding of the TZQuoteState and
143  TZWordState which represents either Quoted-String-Data or
144  Catalog/Table/Alias/Field name-spaces.
145 
146  This State is only neccessary for <code>Delphi2009+</code> ( 2009 and later)
147  and results of it's mixing nByte-Chars and binary-Data 1Byte-Chars.
148  }
149  TZEscapeState = class (TZTokenizerState)
150  function NextToken(Stream: TStream; FirstChar: Char;
151  Tokenizer: TZTokenizer): TZToken; override;
152  end;
153  {**
154  A NumberState object returns a number from a reader. This
155  state's idea of a number allows an optional, initial
156  minus sign, followed by one or more digits. A decimal
157  point and another string of digits may follow these digits.
158  }
159  TZNumberState = class (TZTokenizerState)
160  public
161  function NextToken(Stream: TStream; FirstChar: Char;
162  Tokenizer: TZTokenizer): TZToken; override;
163  end;
164 
165  {**
166  A quoteState returns a quoted string token from a reader.
167  This state will collect characters until it sees a match
168  to the character that the tokenizer used to switch to
169  this state. For example, if a tokenizer uses a double-
170  quote character to enter this state, then <code>
171  nextToken()</code> will search for another double-quote
172  until it finds one or finds the end of the reader.
173  }
174  TZQuoteState = class (TZTokenizerState)
175  public
176  function NextToken(Stream: TStream; FirstChar: Char;
177  Tokenizer: TZTokenizer): TZToken; override;
178 
179  function EncodeString(const Value: string; QuoteChar: Char): string; virtual;
180  function DecodeString(const Value: string; QuoteChar: Char): string; virtual;
181  end;
182 
183  {**
184  A CommentState object returns a comment from a reader.
185  }
186  TZCommentState = class (TZTokenizerState)
187  public
188  function NextToken(Stream: TStream; FirstChar: Char;
189  Tokenizer: TZTokenizer): TZToken; override;
190  end;
191 
192  {**
193  This state will either delegate to a comment-handling
194  state, or return a token with just a slash in it.
195  }
196  TZCppCommentState = class (TZCommentState)
197  protected
198  function GetMultiLineComment(Stream: TStream): string; virtual;
199  function GetSingleLineComment(Stream: TStream): string; virtual;
200  public
201  function NextToken(Stream: TStream; FirstChar: Char;
202  Tokenizer: TZTokenizer): TZToken; override;
203  end;
204 
205  {**
206  This state will either delegate to a comment-handling
207  state, or return a token with just a slash in it.
208  }
209  TZCCommentState = class (TZCppCommentState)
210  public
211  function NextToken(Stream: TStream; FirstChar: Char;
212  Tokenizer: TZTokenizer): TZToken; override;
213  end;
214 
215  {*Fix for C++ Builder hpp generation bug - #817612 *}
216  (*$HPPEMIT 'namespace Ztokenizer {class DELPHICLASS TZSymbolNode;}' *)
217  // Forward declaration
218  TZSymbolNode = class;
219  TZSymbolNodeArray = array of TZSymbolNode;
220 
221  {**
222  A <code>SymbolNode</code> object is a member of a tree that
223  contains all possible prefixes of allowable symbols. Multi-
224  character symbols appear in a <code>SymbolNode</code> tree
225  with one node for each character.
226 
227  For example, the symbol <code>=:~</code> will appear in a
228  tree as three nodes. The first node contains an equals sign,
229  and has a child; that child contains a colon and has a
230  child; this third child contains a tilde, and has no
231  children of its own. If the colon node had another child
232  for a dollar sign character, then the tree would contain
233  the symbol <code>=:$</code>.
234 
235  A tree of <code>SymbolNode</code> objects collaborate to
236  read a (potentially multi-character) symbol from an input
237  stream. A root node with no character of its own finds an
238  initial node that represents the first character in the
239  input. This node looks to see if the next character in the
240  stream matches one of its children. If so, the node
241  delegates its reading task to its child. This approach
242  walks down the tree, pulling symbols from the input that
243  match the path down the tree.
244 
245  When a node does not have a child that matches the next
246  character, we will have read the longest possible symbol
247  prefix. This prefix may or may not be a valid symbol.
248  Consider a tree that has had <code>=:~</code> added and has
249  not had <code>=:</code> added. In this tree, of the three
250  nodes that contain <code>=:~</code>, only the first and
251  third contain complete symbols. If, say, the input contains
252  <code>=:a</code>, the colon node will not have a child that
253  matches the 'a' and so it will stop reading. The colon node
254  has to "unread": it must push back its character, and ask
255  its parent to unread. Unreading continues until it reaches
256  an ancestor that represents a valid symbol.
257  }
258  TZSymbolNode = class (TObject)
259  private
260  FCharacter: Char;
261  FChildren: TZSymbolNodeArray;
262  FValid: Boolean;
263  FParent: TZSymbolNode;
264  protected
265  procedure AddDescendantLine(const Value: string);
266  function DeepestRead(Stream: TStream): TZSymbolNode;
267  function EnsureChildWithChar(Value: Char): TZSymbolNode;
268  function FindChildWithChar(Value: Char): TZSymbolNode; virtual;
269  function FindDescendant(const Value: string): TZSymbolNode;
270  function UnreadToValid(Stream: TStream): TZSymbolNode;
271 
272  property Children: TZSymbolNodeArray read FChildren write FChildren;
273  property Character: Char read FCharacter write FCharacter;
274  property Valid: Boolean read FValid write FValid;
275  property Parent: TZSymbolNode read FParent write FParent;
276  public
277  constructor Create(Parent: TZSymbolNode; Character: Char);
278  destructor Destroy; override;
279 
280  function Ancestry: string; virtual;
281  end;
282 
283  {**
284  This class is a special case of a <code>SymbolNode</code>. A
285  <code>SymbolRootNode</code> object has no symbol of its
286  own, but has children that represent all possible symbols.
287  }
288  TZSymbolRootNode = class (TZSymbolNode)
289  protected
290  function FindChildWithChar(Value: Char): TZSymbolNode; override;
291  public
292  constructor Create;
293 
294  procedure Add(const Value: string);
295  function Ancestry: string; override;
296  function NextSymbol(Stream: TStream; FirstChar: Char): string;
297  end;
298 
299  {**
300  The idea of a symbol is a character that stands on its
301  own, such as an ampersand or a parenthesis. For example,
302  when tokenizing the expression <code>(isReady)&
303  (isWilling) </code>, a typical tokenizer would return 7
304  tokens, including one for each parenthesis and one for
305  the ampersand. Thus a series of symbols such as
306  <code>)&( </code> becomes three tokens, while a series
307  of letters such as <code>isReady</code> becomes a single
308  word token.
309  <p>
310  Multi-character symbols are an exception to the rule
311  that a symbol is a standalone character. For example, a
312  tokenizer may want less-than-or-equals to tokenize as a
313  single token. This class provides a method for
314  establishing which multi-character symbols an object of
315  this class should treat as single symbols. This allows,
316  for example, <code>"cat <= dog"</code> to tokenize as
317  three tokens, rather than splitting the less-than and
318  equals symbols into separate tokens.
319  <p>
320  By default, this state recognizes the following multi-
321  character symbols: <code>!=, :-, <=, >=</code>
322  }
323  TZSymbolState = class (TZTokenizerState)
324  private
325  FSymbols: TZSymbolRootNode;
326  protected
327  property Symbols: TZSymbolRootNode read FSymbols write FSymbols;
328  public
329  constructor Create;
330  destructor Destroy; override;
331 
332  function NextToken(Stream: TStream; FirstChar: Char;
333  Tokenizer: TZTokenizer): TZToken; override;
334  procedure Add(const Value: string); virtual;
335  end;
336 
337  {**
338  A whitespace state ignores whitespace (such as blanks
339  and tabs), and returns the tokenizer's next token. By
340  default, all characters from 0 to 32 are whitespace.
341  }
342  TZWhitespaceState = class (TZTokenizerState)
343  private
344  FWhitespaceChars: array[0..ord(high(char))] of Boolean;
345  public
346  constructor Create;
347 
348  function NextToken(Stream: TStream; FirstChar: Char;
349  Tokenizer: TZTokenizer): TZToken; override;
350  procedure SetWhitespaceChars(FromChar: Char; ToChar: Char; Enable: Boolean);
351  end;
352 
353  {**
354  A wordState returns a word from a reader. Like other
355  states, a tokenizer transfers the job of reading to this
356  state, depending on an initial character. Thus, the
357  tokenizer decides which characters may begin a word, and
358  this state determines which characters may appear as a
359  second or later character in a word. These are typically
360  different sets of characters; in particular, it is typical
361  for digits to appear as parts of a word, but not as the
362  initial character of a word.
363  <p>
364  By default, the following characters may appear in a word.
365  The method <code>setWordChars()</code> allows customizing
366  this.
367  <blockquote><pre>
368  From To
369  'a', 'z'
370  'A', 'Z'
371  '0', '9'
372 
373  as well as: minus sign, underscore, and apostrophe.
374  </pre></blockquote>
375  }
376  TZWordState = class (TZTokenizerState)
377  private
378  FWordChars: array[0..ord(high(char))] of Boolean;
379  public
380  constructor Create;
381 
382  function NextToken(Stream: TStream; FirstChar: Char;
383  Tokenizer: TZTokenizer): TZToken; override;
384  procedure SetWordChars(FromChar: Char; ToChar: Char; Enable: Boolean);
385  end;
386 
387  {**
388  A tokenizer divides a string into tokens. This class is
389  highly customizable with regard to exactly how this division
390  occurs, but it also has defaults that are suitable for many
391  languages. This class assumes that the character values read
392  from the string lie in the range 0-255. For example, the
393  Unicode value of a capital A is 65, so
394  <code> System.out.println((char)65); </code> prints out a
395  capital A.
396  <p>
397  The behavior of a tokenizer depends on its character state
398  table. This table is an array of 256 <code>TokenizerState
399  </code> states. The state table decides which state to
400  enter upon reading a character from the input string.
401  <p>
402  For example, by default, upon reading an 'A', a tokenizer
403  will enter a "word" state. This means the tokenizer will
404  ask a <code>WordState</code> object to consume the 'A',
405  along with the characters after the 'A' that form a word.
406  The state's responsibility is to consume characters and
407  return a complete token.
408  <p>
409  The default table sets a SymbolState for every character
410  from 0 to 255, and then overrides this with:
411  <blockquote><pre>
412  From To State
413  0 ' ' whitespaceState
414  'a' 'z' wordState
415  'A' 'Z' wordState
416  160 255 wordState
417  '0' '9' numberState
418  '-' '-' numberState
419  '.' '.' numberState
420  '"' '"' quoteState
421  '\'' '\'' quoteState
422  '/' '/' slashState
423  </pre></blockquote>
424  In addition to allowing modification of the state table,
425  this class makes each of the states above available. Some
426  of these states are customizable. For example, wordState
427  allows customization of what characters can be part of a
428  word, after the first character.
429  }
430  IZTokenizer = interface (IZInterface)
431  ['{C7CF190B-C45B-4AB4-A406-5999643DF6A0}']
432 
433  function TokenizeBufferToList(const Buffer: string; Options: TZTokenOptions):
434  TStrings;
435  function TokenizeStreamToList(Stream: TStream; Options: TZTokenOptions):
436  TStrings;
437 
438  function TokenizeBuffer(const Buffer: string; Options: TZTokenOptions):
439  TZTokenDynArray;
440  function TokenizeStream(Stream: TStream; Options: TZTokenOptions):
441  TZTokenDynArray;
442 
443  function GetCommentState: TZCommentState;
444  function GetNumberState: TZNumberState;
445  function GetQuoteState: TZQuoteState;
446  function GetSymbolState: TZSymbolState;
447  function GetWhitespaceState: TZWhitespaceState;
448  function GetWordState: TZWordState;
449  function GetCharacterState(StartChar: Char): TZTokenizerState;
450  function AnsiGetEscapeString(const Ansi: RawByteString): String;
451  {$IF defined(FPC) and defined(WITH_RAWBYTESTRING)}
452  function GetEscapeString(const EscapeString: RawByteString): RawByteString;
453  {$ELSE}
454  function GetEscapeString(const EscapeString: String): String;
455  {$IFEND}
456  end;
457 
458  {** Implements a default tokenizer object. }
459  TZTokenizer = class (TZAbstractObject, IZTokenizer)
460  private
461  FCharacterStates: array[0..ord(high(char))] of TZTokenizerState;
462  FCommentState: TZCommentState;
463  FNumberState: TZNumberState;
464  FQuoteState: TZQuoteState;
465  FSymbolState: TZSymbolState;
466  FWhitespaceState: TZWhitespaceState;
467  FWordState: TZWordState;
468  FEscapeState: TZEscapeState; //EgonHugeist
469  protected
470  function CheckEscapeState(const ActualState: TZTokenizerState;
471  Stream: TStream; const FirstChar: Char): TZTokenizerState; virtual;
472  public
473  constructor Create;
474  destructor Destroy; override;
475 
476  function AnsiGetEscapeString(const EscapeString: RawByteString): String; virtual;
477  {$IF defined(FPC) and defined(WITH_RAWBYTESTRING)}
478  function GetEscapeString(const EscapeString: RawByteString): RawByteString;
479  {$ELSE}
480  function GetEscapeString(const EscapeString: String): String;
481  {$IFEND}
482  function TokenizeBufferToList(const Buffer: string; Options: TZTokenOptions):
483  TStrings;
484  function TokenizeStreamToList(Stream: TStream; Options: TZTokenOptions):
485  TStrings;
486 
487  function TokenizeBuffer(const Buffer: string; Options: TZTokenOptions):
488  TZTokenDynArray;
489  function TokenizeStream(Stream: TStream; Options: TZTokenOptions):
490  TZTokenDynArray;
491 
492  function GetCharacterState(StartChar: Char): TZTokenizerState;
493  procedure SetCharacterState(FromChar, ToChar: Char; State: TZTokenizerState);
494 
495  function GetEscapeState: TZEscapeState;
496  function GetCommentState: TZCommentState;
497  function GetNumberState: TZNumberState;
498  function GetQuoteState: TZQuoteState;
499  function GetSymbolState: TZSymbolState;
500  function GetWhitespaceState: TZWhitespaceState;
501  function GetWordState: TZWordState;
502 
503  property EscapeState: TZEscapeState read FEscapeState write FEscapeState;
504  property CommentState: TZCommentState read FCommentState write FCommentState;
505  property NumberState: TZNumberState read FNumberState write FNumberState;
506  property QuoteState: TZQuoteState read FQuoteState write FQuoteState;
507  property SymbolState: TZSymbolState read FSymbolState write FSymbolState;
508  property WhitespaceState: TZWhitespaceState read FWhitespaceState
509  write FWhitespaceState;
510  property WordState: TZWordState read FWordState write FWordState;
511  end;
512 
513 const
514  EscapeMarkSequence = String('~<|');
515 
516 
517 implementation
518 
519 uses
520  Math, StrUtils;
521 
522 {$IFDEF FPC}
523  {$HINTS OFF}
524 {$ENDIF}
525 
526 { TZEscapeState } //EgonHugeist
527 
528 {**
529  Return a quoted Escape-data-string of token from a reader. This method
530  will collect characters until it sees a match to the
531  character that the tokenizer used to switch to this state.
532 
533  @return a quoted string token from a reader
534 }
535 function TZEscapeState.NextToken(Stream: TStream; FirstChar: Char;
536  Tokenizer: TZTokenizer): TZToken;
537 var
538  TempChar: Char;
539  TempStr, LenString: string;
540  I, IReadCount: Integer;
541 
542  function ReadNextCharToTempChar: Boolean;
543  begin
544  Result := True;
545  if Stream.Read(TempChar, 1 * SizeOf(Char)) = 0 then
546  begin
547  Result := False;
548  TempChar := #0;
549  end
550  else
551  Inc(iReadCount);
552  end;
553 
554  procedure RollbackStream;
555  begin
556  Stream.Seek(-(iReadCount * SizeOf(Char)), soFromCurrent);
557  Result.Value := '';
558  IReadCount := 0;
559  end;
560 
561  function CheckMarkChars(Marks: String): Boolean;
562  var
563  iMark: Integer;
564  begin
565  Result := False;
566 
567  if ( TempChar = Copy(Marks, 1, 1) ) then
568  for iMark := 2 to Length(Marks) do //First Char was predetected
569  begin
570  if ReadNextCharToTempChar then
571  begin
572  if not ( TempChar = Copy(Marks, iMark, 1) ) then
573  begin
574  RollbackStream;
575  Exit;
576  end;
577  end else
578  begin
579  RollbackStream;
580  Exit;
581  end;
582  end
583  else
584  begin
585  RollbackStream;
586  Exit;
587  end;
588  Result := True;
589  end;
590 
591  function ReadLengthString: String;
592  var
593  B: Boolean;
594  begin
595  Result := ''; //init value
596  repeat
597  B := ReadNextCharToTempChar;
598  if B then
599  if CharInSet(TempChar, ['0'..'9']) then
600  Result := Result+TempChar;
601  until ( not CharInSet(TempChar, ['0'..'9'])) or ( not B );
602  end;
603 begin
604  Result.TokenType := ttUnknown;
605  Result.Value := '';
606 
607  iReadCount := 0; //init Value
608  TempStr := '';
609  TempChar := FirstChar; //FirstChar: ~
610 
611  if not CheckMarkChars(EscapeMarkSequence) then Exit;
612 
613  //All inMark-Chars where test.
614  //Now Check for Numeric Chars until MarkOut was found or #0 was Resulted
615  LenString := ReadLengthString;
616  if LenString = '' then
617  begin
618  RollbackStream;
619  Exit;
620  end;
621 
622  //Now Check the TempChar for it's hits on cBinDetectCharsOut
623  if not CheckMarkChars(ReverseString(EscapeMarkSequence)) then Exit;
624 
625  //OutMarks where Found too. So let's read the BinarayData to the TempStr
626  //Including the Quotes
627  for i := 0 to StrToInt(LenString) do
628  begin
629  if not ReadNextCharToTempChar then
630  Exit
631  else
632  TempStr := TempStr + TempChar;
633  end;
634  //Done and still in here! Post Data to Result!
635  Result.Value := Copy(TempStr, 1, Length(TempStr)-1);
636 
637  //Now Check for in Chars again..
638  if not CheckMarkChars(EscapeMarkSequence) then Exit;
639  //MarkIn-Chars where found now compare the read-length
640  TempStr := LenString; //Save to before compare
641  LenString := ReadLengthString;
642  if ( LenString = '' ) or ( LenString <> TempStr ) then
643  begin
644  RollbackStream;
645  Exit;
646  end;
647 
648  //Now Check the TempChar for it's hits on Escape-Detect-CharsOut again..
649  if not CheckMarkChars(ReverseString(EscapeMarkSequence)) then Exit;
650  //MarkOut-Chars where found again now we are ready here
651 
652  //everything was fine! Now we are sure Escape data was here
653  Result.TokenType := ttEscape;
654  //End..
655 end;
656 
657 { TZNumberState }
658 
659 {**
660  Return a number token from a reader.
661  @return a number token from a reader
662 }
663 function TZNumberState.NextToken(Stream: TStream; FirstChar: Char;
664  Tokenizer: TZTokenizer): TZToken;
665 var
666  ReadNum: Integer;
667  AbsorbedLeadingMinus: Boolean;
668  AbsorbedDot: Boolean;
669  GotAdigit: Boolean;
670 
671  function AbsorbDigits: string;
672  begin
673  Result := '';
674  while CharInSet(FirstChar, ['0'..'9']) do
675  begin
676  GotAdigit := True;
677  Result := Result + FirstChar;
678  ReadNum := Stream.Read(FirstChar, 1 * SizeOf(Char));
679  if ReadNum = 0 then
680  Break;
681  end;
682  end;
683 
684 begin
685  { Initializes the process. }
686  ReadNum := 0;
687  AbsorbedLeadingMinus := False;
688  AbsorbedDot := False;
689  GotAdigit := False;
690 
691  Result.TokenType := ttUnknown;
692  Result.Value := '';
693 
694  { Parses left part of the number. }
695  if FirstChar = '-' then
696  begin
697  ReadNum := Stream.Read(FirstChar, 1 * SizeOf(Char));
698  Result.Value := '-';
699  AbsorbedLeadingMinus := True;
700  end;
701  Result.Value := Result.Value + AbsorbDigits;
702 
703  { Parses right part of the number. }
704  if FirstChar = '.' then
705  begin
706  AbsorbedDot := True;
707  Result.Value := Result.Value + '.';
708  ReadNum := Stream.Read(FirstChar, 1 * SizeOf(Char));
709  if ReadNum > 0 then
710  Result.Value := Result.Value + AbsorbDigits;
711  end;
712 
713  { Pushback wrong symbols. }
714  Stream.Seek(-ReadNum, soFromCurrent);
715 
716  { Gets a token result. }
717  if not GotAdigit then
718  begin
719  if AbsorbedLeadingMinus and AbsorbedDot then
720  begin
721  Stream.Seek(-(1 * SizeOf(Char)), soFromCurrent);
722  if Tokenizer.SymbolState <> nil then
723  Result := Tokenizer.SymbolState.NextToken(Stream, '-', Tokenizer);
724  end
725  else if AbsorbedLeadingMinus then
726  begin
727  if Tokenizer.SymbolState <> nil then
728  Result := Tokenizer.SymbolState.NextToken(Stream, '-', Tokenizer)
729  end
730  else if AbsorbedDot then
731  begin
732  if Tokenizer.SymbolState <> nil then
733  Result := Tokenizer.SymbolState.NextToken(Stream, '.', Tokenizer);
734  end;
735  end
736  else
737  begin
738  if AbsorbedDot then
739  Result.TokenType := ttFloat
740  else
741  Result.TokenType := ttInteger;
742  end;
743 end;
744 
745 { TZQuoteState }
746 
747 {**
748  Return a quoted string token from a reader. This method
749  will collect characters until it sees a match to the
750  character that the tokenizer used to switch to this state.
751 
752  @return a quoted string token from a reader
753 }
754 function TZQuoteState.NextToken(Stream: TStream; FirstChar: Char;
755  Tokenizer: TZTokenizer): TZToken;
756 var
757  TempChar: Char;
758  TempStr: string;
759 begin
760  TempStr := FirstChar;
761  repeat
762  if Stream.Read(TempChar, 1 * SizeOf(Char)) = 0 then
763  TempChar := FirstChar;
764  TempStr := TempStr + TempChar;
765  until TempChar = FirstChar;
766 
767  Result.TokenType := ttQuoted;
768  Result.Value := TempStr;
769 end;
770 
771 {**
772  Encodes a string value.
773  @param Value a string value to be encoded.
774  @param QuoteChar a string quote character.
775  @returns an encoded string.
776 }
777 function TZQuoteState.EncodeString(const Value: string; QuoteChar: Char): string;
778 begin
779  Result := QuoteChar + Value + QuoteChar;
780 end;
781 
782 {**
783  Decodes a string value.
784  @param Value a string value to be decoded.
785  @param QuoteChar a string quote character.
786  @returns an decoded string.
787 }
788 function TZQuoteState.DecodeString(const Value: string; QuoteChar: Char): string;
789 begin
790  if (Length(Value) >= 2) and (Value[1] = QuoteChar)
791  and (Value[Length(Value)] = Value[1]) then
792  Result := Copy(Value, 2, Length(Value) - 2)
793  else
794  Result := Value;
795 end;
796 
797 { TZBasicCommentState }
798 
799 {**
800  Either delegate to a comment-handling state, or return a
801  token with just a slash in it.
802 
803  @return either just a slash token, or the results of
804  delegating to a comment-handling state
805 }
806 function TZCommentState.NextToken(Stream: TStream; FirstChar: Char;
807  Tokenizer: TZTokenizer): TZToken;
808 var
809  ReadChar: Char;
810  ReadStr: string;
811 begin
812  ReadStr := FirstChar;
813  while (Stream.Read(ReadChar, 1 * SizeOf(Char)) > 0) and not CharInSet(ReadChar, [#10, #13]) do
814  ReadStr := ReadStr + ReadChar;
815  if CharInSet(ReadChar, [#10, #13]) then
816  Stream.Seek(-(1 * SizeOf(Char)), soFromCurrent);
817 
818  Result.TokenType := ttComment;
819  Result.Value := ReadStr;
820 end;
821 
822 { TZCppCommentState }
823 
824 {**
825  Ignore everything up to a closing star and slash, and
826  then return the tokenizer's next token.
827  @return the tokenizer's next token
828 }
829 function TZCppCommentState.GetMultiLineComment(Stream: TStream): string;
830 var
831  ReadChar, LastChar: Char;
832 begin
833  LastChar := #0;
834  Result := '';
835  while Stream.Read(ReadChar, 1 * SizeOf(Char)) > 0 do
836  begin
837  Result := Result + ReadChar;
838  if (LastChar = '*') and (ReadChar = '/') then
839  Break;
840  LastChar := ReadChar;
841  end;
842 end;
843 
844 {**
845  Ignore everything up to an end-of-line and return the tokenizer's next token.
846  @return the tokenizer's next token
847 }
848 function TZCppCommentState.GetSingleLineComment(Stream: TStream): string;
849 var
850  ReadChar: Char;
851 begin
852  Result := '';
853  while (Stream.Read(ReadChar, 1 * SizeOf(Char)) > 0) and not CharInSet(ReadChar, [#10, #13]) do
854  Result := Result + ReadChar;
855 
856  // mdaems : for single line comments the line ending must be included
857  // as it should never be stripped off or unified with other whitespace characters
858  if CharInSet(ReadChar, [#10, #13]) then
859  begin
860  Result := Result + ReadChar;
861  // ludob Linux line terminator is just LF, don't read further if we already have LF
862  if (ReadChar<>#10) and (Stream.Read(ReadChar, 1 * SizeOf(Char)) > 0) then
863  if CharInSet(ReadChar, [#10, #13]) then
864  Result := Result + ReadChar
865  else
866  Stream.Seek(-(1 * SizeOf(Char)), soFromCurrent);
867  end;
868 end;
869 
870 {**
871  Either delegate to a comment-handling state, or return a
872  token with just a slash in it.
873 
874  @return either just a slash token, or the results of
875  delegating to a comment-handling state
876 }
877 function TZCppCommentState.NextToken(Stream: TStream; FirstChar: Char;
878  Tokenizer: TZTokenizer): TZToken;
879 var
880  ReadChar: Char;
881  ReadNum: Integer;
882 begin
883  Result.TokenType := ttUnknown;
884  Result.Value := FirstChar;
885 
886  ReadNum := Stream.Read(ReadChar, 1 * SizeOf(Char));
887  if (ReadNum > 0) and (ReadChar = '*') then
888  begin
889  Result.TokenType := ttComment;
890  Result.Value := '/*' + GetMultiLineComment(Stream);
891  end
892  else if (ReadNum > 0) and (ReadChar = '/') then
893  begin
894  Result.TokenType := ttComment;
895  Result.Value := '//' + GetSingleLineComment(Stream);
896  end
897  else
898  begin
899  if ReadNum > 0 then
900  Stream.Seek(-(1 * SizeOf(Char)), soFromCurrent);
901  if Tokenizer.SymbolState <> nil then
902  Result := Tokenizer.SymbolState.NextToken(Stream, FirstChar, Tokenizer);
903  end;
904 end;
905 
906 { TZCCommentState }
907 
908 {**
909  Gets a C specific comments like /* */.
910  @return either just a slash token, or the results of
911  delegating to a comment-handling state
912 }
913 function TZCCommentState.NextToken(Stream: TStream; FirstChar: Char;
914  Tokenizer: TZTokenizer): TZToken;
915 var
916  ReadChar: Char;
917  ReadNum: Integer;
918 begin
919  Result.TokenType := ttUnknown;
920  Result.Value := FirstChar;
921 
922  if FirstChar = '/' then
923  begin
924  ReadNum := Stream.Read(ReadChar, 1 * SizeOf(Char));
925  if (ReadNum > 0) and (ReadChar = '*') then
926  begin
927  Result.TokenType := ttComment;
928  Result.Value := '/*' + GetMultiLineComment(Stream);
929  end
930  else
931  begin
932  if ReadNum > 0 then
933  Stream.Seek(-(1 * SizeOf(Char)), soFromCurrent);
934  end;
935  end;
936 
937  if (Result.TokenType = ttUnknown) and (Tokenizer.SymbolState <> nil) then
938  Result := Tokenizer.SymbolState.NextToken(Stream, FirstChar, Tokenizer);
939 end;
940 
941 { TZSymbolNode }
942 
943 {**
944  Constructs a SymbolNode with the given parent, representing
945  the given character.
946  @param Parent this node's parent
947  @param Character this node's character
948 }
949 constructor TZSymbolNode.Create(Parent: TZSymbolNode; Character: Char);
950 begin
951  FParent := Parent;
952  FCharacter := Character;
953  FValid := False;
954  SetLength(FChildren, 256);
955 end;
956 
957 {**
958  Destroys this symbol object and cleanups the memory.
959 }
960 destructor TZSymbolNode.Destroy;
961 var
962  I: Integer;
963 begin
964  for I := 0 to 255 do
965  begin
966  if FChildren[I] <> nil then
967  FChildren[I].Free
968  else
969  Break;
970  end;
971  SetLength(FChildren, 0);
972  FParent := nil;
973  inherited Destroy;
974 end;
975 
976 {**
977  Add a line of descendants that represent the characters in the given string.
978 }
979 procedure TZSymbolNode.AddDescendantLine(const Value: string);
980 var
981  Node: TZSymbolNode;
982 begin
983  if Length(Value) > 0 then
984  begin
985  Node := EnsureChildWithChar(Value[1]);
986  Node.AddDescendantLine(Copy(Value, 2, Length(Value) - 1));
987  end;
988 end;
989 
990 {**
991  Show the symbol this node represents.
992  @return the symbol this node represents
993 }
994 function TZSymbolNode.Ancestry: string;
995 begin
996  Result := FParent.Ancestry + FCharacter;
997 end;
998 
999 {**
1000  Find the descendant that takes as many characters as possible from the input.
1001 }
1002 function TZSymbolNode.DeepestRead(Stream: TStream): TZSymbolNode;
1003 var
1004  TempChar: Char;
1005  Node: TZSymbolNode;
1006  ReadNum: Integer;
1007 begin
1008  ReadNum := Stream.Read(TempChar, 1 * SizeOf(Char));
1009  if ReadNum > 0 then
1010  Node := FindChildWithChar(TempChar)
1011  else
1012  Node := nil;
1013 
1014  if Node = nil then
1015  begin
1016  Stream.Seek(-ReadNum, soFromCurrent);
1017  Result := Self;
1018  end
1019  else
1020  Result := Node.DeepestRead(Stream);
1021 end;
1022 
1023 {**
1024  Find or create a child for the given character.
1025 }
1026 function TZSymbolNode.EnsureChildWithChar(Value: Char): TZSymbolNode;
1027 var
1028  N: Integer;
1029 begin
1030  Result := FindChildWithChar(Value);
1031  if Result = nil then
1032  begin
1033  N := 0;
1034  while (FChildren[N] <> nil) and (N <= 255) do
1035  Inc(N);
1036  if N <= 255 then
1037  begin
1038  Result := TZSymbolNode.Create(Self, Value);
1039  FChildren[N] := Result;
1040  end;
1041  end;
1042 end;
1043 
1044 {**
1045  Find a child with the given character.
1046 }
1047 function TZSymbolNode.FindChildWithChar(Value: Char): TZSymbolNode;
1048 var
1049  I: Integer;
1050  Current: TZSymbolNode;
1051 begin
1052  Result := nil;
1053  for I := 0 to 255 do
1054  begin
1055  Current := Children[I];
1056  if (Current = nil) or (Current.Character = Value) then
1057  begin
1058  Result := Current;
1059  Break;
1060  end;
1061  end;
1062 end;
1063 
1064 {**
1065  Find a descendant which is down the path the given string indicates.
1066 }
1067 function TZSymbolNode.FindDescendant(const Value: string): TZSymbolNode;
1068 var
1069  TempChar: Char;
1070 begin
1071  if Length(Value) > 0 then
1072  TempChar := Value[1]
1073  else
1074  TempChar := #0;
1075  Result := FindChildWithChar(TempChar);
1076  if (Length(Value) > 1) and (Result <> nil) then
1077  Result := Result.FindDescendant(Copy(Value, 2, Length(Value) - 1));
1078 end;
1079 
1080 {**
1081  Unwind to a valid node; this node is "valid" if its
1082  ancestry represents a complete symbol. If this node is
1083  not valid, put back the character and ask the parent to unwind.
1084 }
1085 function TZSymbolNode.UnreadToValid(Stream: TStream): TZSymbolNode;
1086 begin
1087  if not FValid then
1088  begin
1089  Stream.Seek(-(1 * SizeOf(Char)), soFromCurrent);
1090  Result := FParent.UnreadToValid(Stream);
1091  end
1092  else
1093  Result := Self;
1094 end;
1095 
1096 { TZSymbolRootNode }
1097 
1098 {**
1099  Create and initialize a root node.
1100 }
1101 constructor TZSymbolRootNode.Create;
1102 var
1103  I: Integer;
1104 begin
1105  inherited Create(nil, #0);
1106 
1107  for I := 0 to 255 do
1108  begin
1109  FChildren[I] := TZSymbolNode.Create(Self, Chr(I));
1110  FChildren[I].Valid := True;
1111  end;
1112 end;
1113 
1114 {**
1115  Add the given string as a symbol.
1116  @param String the character sequence to add
1117 }
1118 procedure TZSymbolRootNode.Add(const Value: string);
1119 var
1120  TempChar: Char;
1121  Node: TZSymbolNode;
1122 begin
1123  if Length(Value) > 0 then
1124  TempChar := Value[1]
1125  else
1126  TempChar := #0;
1127  Node := EnsureChildWithChar(TempChar);
1128  Node.AddDescendantLine(Copy(Value, 2, Length(Value) - 1));
1129  FindDescendant(Value).Valid := True;
1130 end;
1131 
1132 {**
1133  A root node has no parent and no character of its own, so its ancestry is "".
1134  @return an empty string
1135 }
1136 function TZSymbolRootNode.Ancestry: string;
1137 begin
1138  Result := '';
1139 end;
1140 
1141 {**
1142  A root node maintains its children in an array instead of
1143  a Vector, to be faster.
1144 }
1145 function TZSymbolRootNode.FindChildWithChar(Value: Char): TZSymbolNode;
1146 begin
1147  Result := FChildren[Ord(Value)];
1148 end;
1149 
1150 {**
1151  Return a symbol string from a reader.
1152 
1153  @param Stream a reader to read from
1154  @param FirstChar the first character of this symbol, already
1155  read from the reader
1156  @return a symbol string from a reader
1157 }
1158 function TZSymbolRootNode.NextSymbol(Stream: TStream; FirstChar: Char): string;
1159 var
1160  Node: TZSymbolNode;
1161 begin
1162  Node := FindChildWithChar(FirstChar);
1163  Node := Node.DeepestRead(Stream);
1164  Node := Node.UnreadToValid(Stream);
1165  Result := Node.Ancestry;
1166 end;
1167 
1168 { TZSymbolState }
1169 
1170 {**
1171  Constructs a symbol state with a default idea of what
1172  multi-character symbols to accept (as described in the class comment).
1173 }
1174 constructor TZSymbolState.Create;
1175 begin
1176  FSymbols := TZSymbolRootNode.Create;
1177 end;
1178 
1179 {**
1180  Destroys this object and cleanups the memory.
1181 }
1182 destructor TZSymbolState.Destroy;
1183 begin
1184  FSymbols.Free;
1185  inherited Destroy;
1186 end;
1187 
1188 {**
1189  Add a multi-character symbol.
1190  @param Value the symbol to add, such as "=:="
1191 }
1192 procedure TZSymbolState.Add(const Value: string);
1193 begin
1194  FSymbols.Add(Value);
1195 end;
1196 
1197 {**
1198  Return a symbol token from a reader.
1199  @return a symbol token from a reader
1200 }
1201 function TZSymbolState.NextToken(Stream: TStream; FirstChar: Char;
1202  Tokenizer: TZTokenizer): TZToken;
1203 begin
1204  Result.TokenType := ttSymbol;
1205  Result.Value := FSymbols.NextSymbol(Stream, FirstChar);
1206 end;
1207 
1208 { TZWhitespaceState }
1209 
1210 {**
1211  Constructs a whitespace state with a default idea of what
1212  characters are, in fact, whitespace.
1213 }
1214 constructor TZWhitespaceState.Create;
1215 begin
1216  SetWhitespaceChars(' ', high(char), False);
1217  SetWhitespaceChars(Chr(0), ' ', True);
1218 end;
1219 
1220 {**
1221  Ignore whitespace (such as blanks and tabs), and return
1222  the tokenizer's next token.
1223  @return the tokenizer's next token
1224 }
1225 function TZWhitespaceState.NextToken(Stream: TStream; FirstChar: Char;
1226  Tokenizer: TZTokenizer): TZToken;
1227 var
1228  ReadNum: Integer;
1229  ReadChar: Char;
1230  ReadStr: string;
1231 begin
1232  ReadStr := FirstChar;
1233  ReadNum := 0;
1234  while True do
1235  begin
1236  ReadNum := Stream.Read(ReadChar, 1 * SizeOf(Char));
1237  if (ReadNum = 0) or not FWhitespaceChars[Ord(ReadChar)] then
1238  Break;
1239  ReadStr := ReadStr + ReadChar;
1240  end;
1241 
1242  if ReadNum > 0 then
1243  Stream.Seek(-(1 * SizeOf(Char)), soFromCurrent);
1244  Result.TokenType := ttWhitespace;
1245  Result.Value := ReadStr;
1246 end;
1247 
1248 {**
1249  Establish the given characters as whitespace to ignore.
1250  @param FromChar first character index.
1251  @param ToChar last character index.
1252  @param Enable true, if this state should ignore characters in the given range
1253 }
1254 procedure TZWhitespaceState.SetWhitespaceChars(FromChar, ToChar: Char;
1255  Enable: Boolean);
1256 var
1257  I: Integer;
1258 begin
1259  for I := Ord(FromChar) to MinIntValue([Ord(ToChar), 255]) do
1260  FWhitespaceChars[I] := Enable;
1261 end;
1262 
1263 { TZWordState }
1264 
1265 {**
1266  Constructs a word state with a default idea of what characters
1267  are admissible inside a word (as described in the class comment).
1268 }
1269 constructor TZWordState.Create;
1270 begin
1271  SetWordChars(#0, #191, False);
1272  SetWordChars(#192, high(char), True);
1273  SetWordChars('a', 'z', True);
1274  SetWordChars('A', 'Z', True);
1275  SetWordChars('0', '9', True);
1276  SetWordChars('-', '-', True);
1277  SetWordChars('_', '_', True);
1278  SetWordChars('''', '''', True);
1279 end;
1280 
1281 {**
1282  Return a word token from a reader.
1283  @return a word token from a reader
1284 }
1285 function TZWordState.NextToken(Stream: TStream; FirstChar: Char;
1286  Tokenizer: TZTokenizer): TZToken;
1287 var
1288  TempChar: Char;
1289  ReadNum: Integer;
1290  Value: string;
1291 begin
1292  Value := FirstChar;
1293  repeat
1294  ReadNum := Stream.Read(TempChar, 1 * SizeOf(Char));
1295  if (ReadNum = 0) or not FWordChars[Ord(TempChar)] then
1296  Break;
1297  Value := Value + TempChar;
1298  until False;
1299 
1300  if ReadNum > 0 then
1301  Stream.Seek(-(1 * SizeOf(Char)), soFromCurrent);
1302  Result.TokenType := ttWord;
1303  Result.Value := Value;
1304 end;
1305 
1306 {**
1307  Establish characters in the given range as valid
1308  characters for part of a word after the first character.
1309  Note that the tokenizer must determine which characters
1310  are valid as the beginning character of a word.
1311  @param FromChar first character index.
1312  @param ToChar last character index.
1313  @param Enable true, if this state should ignore characters in the given range
1314 }
1315 procedure TZWordState.SetWordChars(FromChar, ToChar: Char; Enable: Boolean);
1316 var
1317  I: Integer;
1318 begin
1319  for I := Ord(FromChar) to MinIntValue([Ord(ToChar), Ord(high(char)) ]) do
1320  FWordChars[I] := Enable;
1321 end;
1322 
1323 { TZTokenizer }
1324 
1325 {**
1326  Constructs a tokenizer with a default state table (as
1327  described in the class comment).
1328 }
1329 constructor TZTokenizer.Create;
1330 begin
1331  FSymbolState := TZSymbolState.Create;
1332  with TZSymbolState(FSymbolState) do
1333  begin
1334  Add('<>');
1335  Add('<=');
1336  Add('>=');
1337  end;
1338  FEscapeState := TZEscapeState.Create;
1339  FNumberState := TZNumberState.Create;
1340  FQuoteState := TZQuoteState.Create;
1341  FWhitespaceState := TZWhitespaceState.Create;
1342  FWordState := TZWordState.Create;
1343  FCommentState := TZCppCommentState.Create;
1344 
1345  SetCharacterState(#0, #32, FWhitespaceState);
1346  SetCharacterState(#33, #191, FSymbolState);
1347  SetCharacterState(#192, High(Char), FWordState);
1348 
1349  SetCharacterState('a', 'z', FWordState);
1350  SetCharacterState('A', 'Z', FWordState);
1351  SetCharacterState('0', '9', FNumberState);
1352  SetCharacterState('-', '-', FNumberState);
1353  SetCharacterState('.', '.', FNumberState);
1354  SetCharacterState('"', '"', FQuoteState);
1355  SetCharacterState('''', '''', FQuoteState);
1356  SetCharacterState('/', '/', FCommentState);
1357 end;
1358 
1359 {**
1360  Destroys this object and cleanups the memory.
1361 }
1362 destructor TZTokenizer.Destroy;
1363 begin
1364  if FEscapeState <> nil then
1365  FEscapeState.Free;
1366  if FCommentState <> nil then
1367  FCommentState.Free;
1368  if FNumberState <> nil then
1369  FNumberState.Free;
1370  if FQuoteState <> nil then
1371  FQuoteState.Free;
1372  if FSymbolState <> nil then
1373  FSymbolState.Free;
1374  if FWhitespaceState <> nil then
1375  FWhitespaceState.Free;
1376  if FWordState <> nil then
1377  FWordState.Free;
1378 
1379  inherited Destroy;
1380 end;
1381 
1382 {**
1383  Gets an initial state object for the specified character.
1384  @return an initial state object for the character.
1385 }
1386 function TZTokenizer.GetCharacterState(StartChar: Char): TZTokenizerState;
1387 begin
1388  Result := FCharacterStates[Ord(StartChar)];
1389 end;
1390 
1391 {**
1392  Change the state the tokenizer will enter upon reading
1393  any character between "from" and "to".
1394 
1395  @param FromChar first character index.
1396  @param ToChar last character index.
1397  @param State the state to enter upon reading a
1398  character between "fromChar" and "toChar"
1399 }
1400 procedure TZTokenizer.SetCharacterState(FromChar, ToChar: Char;
1401  State: TZTokenizerState);
1402 var
1403  I: Integer;
1404 const
1405  ORDMAXCHAR = ord(high(char));
1406 begin
1407  for I := Ord(FromChar) to MinIntValue([Ord(ToChar), ORDMAXCHAR]) do
1408  FCharacterStates[I] := State;
1409 end;
1410 
1411 {**
1412  Tokenizes a string buffer into a dynamic array of tokens.
1413  @param Buffer a string buffer to be tokenized.
1414  @param Options a set of tokenizer options.
1415  @returns a dynamic array of tokens
1416 }
1417 function TZTokenizer.TokenizeBuffer(const Buffer: string;
1418  Options: TZTokenOptions): TZTokenDynArray;
1419 var
1420  Stream: TStream;
1421 begin
1422  Stream := TStringStream.Create(Buffer{$IFDEF WITH_TENCODING_CLASS}, TEncoding.Unicode{$ENDIF});
1423  try
1424  Result := TokenizeStream(Stream, Options);
1425  finally
1426  Stream.Free;
1427  end;
1428 end;
1429 
1430 function TZTokenizer.AnsiGetEscapeString(const EscapeString: RawByteString): String;
1431 var
1432  Temp: String;
1433 begin
1434  Temp := EscapeMarkSequence+IntToStr(Length(EscapeString))+ReverseString(EscapeMarkSequence);
1435 
1436  if Length(EscapeString) > 0 then
1437  Result := Temp+String(EscapeString)+Temp
1438  else
1439  Result := 'NULL';
1440 end;
1441 
1442 {$IF defined(FPC) and defined(WITH_RAWBYTESTRING)}
1443 function TZTokenizer.GetEscapeString(const EscapeString: RawByteString): RawByteString;
1444 {$ELSE}
1445 function TZTokenizer.GetEscapeString(const EscapeString: String): String;
1446 {$IFEND}
1447 var
1448  Temp: String;
1449 begin
1450  Temp := EscapeMarkSequence+IntToStr(Length(EscapeString))+ReverseString(EscapeMarkSequence);
1451 
1452  if Length(EscapeString) > 0 then
1453  {$IF defined(FPC) and defined(WITH_RAWBYTESTRING)}
1454  Result := RawByteString(Temp)+EscapeString+RawByteString(Temp)
1455  {$ELSE}
1456  Result := Temp+EscapeString+Temp
1457  {$IFEND}
1458  else
1459  Result := '';
1460 end;
1461 
1462 {**
1463  EgonHugeist:
1464  Checks if SymboState is EscapeState and sets it ...
1465  @param Stream the Read-Stream which has to checked for Next-Chars.
1466  @FirstChar The FirstChar which was readed and sets the Symbolstate
1467  @returns either the given SymbolState or the EscapeState
1468 }
1469 function TZTokenizer.CheckEscapeState(const ActualState: TZTokenizerState;
1470  Stream: TStream; const FirstChar: Char): TZTokenizerState;
1471 var
1472  NextChar: Char;
1473  iReadCount, I: Integer;
1474 begin
1475  Result := ActualState;
1476  iReadCount := 0;
1477  if ( FirstChar = EscapeMarkSequence[1]) then //Token was set so check if its Escape
1478  begin
1479  for i := 2 to Length(EscapeMarkSequence) do
1480  if Stream.Read(NextChar, 1 * SizeOf(Char)) > 0 then //Read next Char
1481  begin
1482  Inc(IReadCount); //increment count of read-Chars
1483  if NextChar <> EscapeMarkSequence[I] then //Compare Chars
1484  begin
1485  Stream.Seek(-(iReadCount * SizeOf(Char)), soFromCurrent); //Seek Stream back to starting Position
1486  Exit;
1487  end
1488  end
1489  else Continue;
1490  end
1491  else
1492  Exit;
1493  Stream.Seek(-(iReadCount * SizeOf(Char)), soFromCurrent); //Seek Stream back to starting Position
1494  Result := Self.EscapeState;
1495 end;
1496 
1497 {**
1498  Tokenizes a string buffer into a list of tokens.
1499  @param Buffer a string buffer to be tokenized.
1500  @param Options a set of tokenizer options.
1501  @returns a string list where Items are tokens and
1502  Objects are token types.
1503 }
1504 function TZTokenizer.TokenizeBufferToList(const Buffer: string;
1505  Options: TZTokenOptions): TStrings;
1506 var
1507  Stream: TStream;
1508 begin
1509  Stream := TStringStream.Create(Buffer{$IFDEF WITH_TENCODING_CLASS}, TEncoding.Unicode{$ENDIF});
1510  try
1511  Result := TokenizeStreamToList(Stream, Options);
1512  finally
1513  Stream.Free;
1514  end;
1515 end;
1516 
1517 {**
1518  Tokenizes a stream into a dynamic array of tokens.
1519  @param Stream a stream to be tokenized.
1520  @param Options a set of tokenizer options.
1521  @returns a dynamic array of tokens
1522 }
1523 function TZTokenizer.TokenizeStream(Stream: TStream;
1524  Options: TZTokenOptions): TZTokenDynArray;
1525 var
1526  I: Integer;
1527  List: TStrings;
1528 begin
1529  List := TokenizeStreamToList(Stream, Options);
1530  try
1531  SetLength(Result, List.Count);
1532  for I := 0 to List.Count - 1 do
1533  begin
1534  Result[I].Value := List[I];
1535  Result[I].TokenType := TZTokenType({$IFDEF FPC}Pointer({$ENDIF}
1536  List.Objects[I]{$IFDEF FPC}){$ENDIF});
1537  end;
1538  finally
1539  List.Free;
1540  end;
1541 end;
1542 
1543 {**
1544  Tokenizes a stream into a string list of tokens.
1545  @param Stream a stream to be tokenized.
1546  @param Options a set of tokenizer options.
1547  @returns a string list where Items are tokens and
1548  Objects are token types.
1549 }
1550 function TZTokenizer.TokenizeStreamToList(Stream: TStream;
1551  Options: TZTokenOptions): TStrings;
1552 var
1553  FirstChar: Char;
1554  Token: TZToken;
1555  LastTokenType: TZTokenType;
1556  State: TZTokenizerState;
1557 begin
1558  Result := TStringList.Create;
1559  LastTokenType := ttUnknown;
1560 
1561  while Stream.Read(FirstChar, 1 * SizeOf(Char)) > 0 do
1562  begin
1563  State := FCharacterStates[Ord(FirstChar)];
1564  if State <> nil then
1565  begin
1566  State := CheckEscapeState(State, Stream, FirstChar);
1567 
1568  Token := State.NextToken(Stream, FirstChar, Self);
1569  { Decode strings. }
1570  if (State is TZQuoteState)
1571  and (toDecodeStrings in Options) then
1572  begin
1573  Token.Value := (State as TZQuoteState).DecodeString(
1574  Token.Value, FirstChar);
1575  end;
1576  { Skips comments if option set. }
1577  if (Token.TokenType = ttComment)
1578  and (toSkipComments in Options) then
1579  Continue;
1580  { Skips whitespaces if option set. }
1581  if (Token.TokenType = ttWhitespace)
1582  and (toSkipWhitespaces in Options) then
1583  Continue;
1584  { Unifies whitespaces if option set. }
1585  if (Token.TokenType = ttWhitespace)
1586  and (toUnifyWhitespaces in Options) then
1587  begin
1588  if LastTokenType = ttWhitespace then
1589  Continue;
1590  Token.Value := ' ';
1591  end;
1592  { Unifies numbers if option set. }
1593  if (Token.TokenType in [ttInteger, ttFloat, ttHexDecimal])
1594  and (toUnifyNumbers in Options) then
1595  Token.TokenType := ttNumber;
1596  { If an integer is immediately followed by a string they should be seen as one string}
1597  if ((Token.TokenType = ttWord)and(LastTokenType = ttInteger)) then
1598  begin
1599  Token.Value := Result[Result.Count-1] + Token.Value;
1600  Result.Delete(Result.Count-1);
1601  end;
1602  { Add a read token. }
1603  LastTokenType := Token.TokenType;
1604  Result.AddObject(Token.Value, TObject(Ord(Token.TokenType)));
1605  end
1606  { Skips unknown chars if option set. }
1607  else if not (toSkipUnknown in Options) then
1608  Result.AddObject(FirstChar, TObject(Ord(ttUnknown)));
1609  end;
1610  { Adds an EOF if option is not set. }
1611  if not (toSkipEOF in Options) then
1612  Result.AddObject('', TObject(Ord(ttEOF)));
1613 end;
1614 
1615 {**
1616  Gets a tokenizer default Escape state.
1617  @returns a tokenizer default Escape state.
1618 }
1619 function TZTokenizer.GetEscapeState: TZEscapeState;
1620 begin
1621  Result := EscapeState;
1622 end;
1623 
1624 {**
1625  Gets a tokenizer default comment state.
1626  @returns a tokenizer default comment state.
1627 }
1628 function TZTokenizer.GetCommentState: TZCommentState;
1629 begin
1630  Result := CommentState;
1631 end;
1632 
1633 {**
1634  Gets a tokenizer default number state.
1635  @returns a tokenizer default number state.
1636 }
1637 function TZTokenizer.GetNumberState: TZNumberState;
1638 begin
1639  Result := NumberState;
1640 end;
1641 
1642 {**
1643  Gets a tokenizer default quote state.
1644  @returns a tokenizer default quote state.
1645 }
1646 function TZTokenizer.GetQuoteState: TZQuoteState;
1647 begin
1648  Result := QuoteState;
1649 end;
1650 
1651 {**
1652  Gets a tokenizer default symbol state.
1653  @returns a tokenizer default symbol state.
1654 }
1655 function TZTokenizer.GetSymbolState: TZSymbolState;
1656 begin
1657  Result := SymbolState;
1658 end;
1659 
1660 {**
1661  Gets a tokenizer default whitespace state.
1662  @returns a tokenizer default whitespace state.
1663 }
1664 function TZTokenizer.GetWhitespaceState: TZWhitespaceState;
1665 begin
1666  Result := WhitespaceState;
1667 end;
1668 
1669 {**
1670  Gets a tokenizer default word state.
1671  @returns a tokenizer default word state.
1672 }
1673 function TZTokenizer.GetWordState: TZWordState;
1674 begin
1675  Result := WordState;
1676 end;
1677 
1678 end.
1679