AbstractLexer.php 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328
  1. <?php
  2. declare(strict_types=1);
  3. namespace Doctrine\Common\Lexer;
  4. use ReflectionClass;
  5. use UnitEnum;
  6. use function implode;
  7. use function preg_split;
  8. use function sprintf;
  9. use function substr;
  10. use const PREG_SPLIT_DELIM_CAPTURE;
  11. use const PREG_SPLIT_NO_EMPTY;
  12. use const PREG_SPLIT_OFFSET_CAPTURE;
  13. /**
  14. * Base class for writing simple lexers, i.e. for creating small DSLs.
  15. *
  16. * @template T of UnitEnum|string|int
  17. * @template V of string|int
  18. */
  abstract class AbstractLexer
  {
      /**
       * Lexer original input string.
       *
       * Defaults to the empty string so that a lexer which has not been given
       * any input yet behaves like one that was given empty input instead of
       * failing on an uninitialized typed property.
       */
      private string $input = '';

      /**
       * Array of scanned tokens.
       *
       * @var list<Token<T, V>>
       */
      private array $tokens = [];

      /**
       * Index into the token list of the token that the next moveNext() call
       * will load into the lookahead.
       */
      private int $position = 0;

      /**
       * Offset, relative to the current position, of the token that the next
       * peek() call will return.
       */
      private int $peek = 0;

      /**
       * The next token in the input.
       *
       * Null until moveNext() has loaded a token, and null again once the end
       * of the token list has been reached.
       *
       * @var Token<T, V>|null
       */
      public Token|null $lookahead = null;

      /**
       * The last matched/seen token.
       *
       * @var Token<T, V>|null
       */
      public Token|null $token = null;

      /**
       * Composed regex for input parsing. Built lazily on the first scan().
       *
       * @var non-empty-string|null
       */
      private string|null $regex = null;

      /**
       * Sets the input data to be tokenized.
       *
       * The Lexer is immediately reset and the new input tokenized.
       * Any unprocessed tokens from any previous input are lost.
       *
       * @param string $input The input to be tokenized.
       *
       * @return void
       */
      public function setInput(string $input)
      {
          $this->input  = $input;
          $this->tokens = [];

          $this->reset();
          $this->scan($input);
      }

      /**
       * Resets the lexer.
       *
       * Clears the lookahead and the last seen token and rewinds both the
       * position and the peek offset to 0. The scanned tokens are kept.
       *
       * @return void
       */
      public function reset()
      {
          $this->lookahead = null;
          $this->token     = null;
          $this->peek      = 0;
          $this->position  = 0;
      }

      /**
       * Resets the peek pointer to 0.
       *
       * @return void
       */
      public function resetPeek()
      {
          $this->peek = 0;
      }

      /**
       * Resets the lexer position on the input to the given position.
       *
       * Only the position is changed; the lookahead and the last seen token
       * keep their current values until the next moveNext() call.
       *
       * @param int $position Position to place the lexical scanner.
       *
       * @return void
       */
      public function resetPosition(int $position = 0)
      {
          $this->position = $position;
      }

      /**
       * Retrieve the original lexer's input until a given position.
       *
       * @param int $position Length, as interpreted by substr(), of the leading
       *                      part of the input to return.
       *
       * @return string
       */
      public function getInputUntilPosition(int $position)
      {
          return substr($this->input, 0, $position);
      }

      /**
       * Checks whether a given token matches the current lookahead.
       *
       * Returns false when there is no lookahead token.
       *
       * @param T $type
       *
       * @return bool
       *
       * @psalm-assert-if-true !=null $this->lookahead
       */
      public function isNextToken(int|string|UnitEnum $type)
      {
          return $this->lookahead !== null && $this->lookahead->isA($type);
      }

      /**
       * Checks whether any of the given tokens matches the current lookahead.
       *
       * Returns false when there is no lookahead token.
       *
       * @param list<T> $types
       *
       * @return bool
       *
       * @psalm-assert-if-true !=null $this->lookahead
       */
      public function isNextTokenAny(array $types)
      {
          return $this->lookahead !== null && $this->lookahead->isA(...$types);
      }

      /**
       * Moves to the next token in the input string.
       *
       * The previous lookahead becomes the current token, the peek offset is
       * reset, and the token at the current position (if any) becomes the new
       * lookahead.
       *
       * @return bool True when a new lookahead token is available.
       *
       * @psalm-assert-if-true !null $this->lookahead
       */
      public function moveNext()
      {
          $this->peek  = 0;
          $this->token = $this->lookahead;

          // The position only advances when a token was actually consumed, so
          // calling moveNext() repeatedly at the end of input is harmless.
          $this->lookahead = isset($this->tokens[$this->position])
              ? $this->tokens[$this->position++]
              : null;

          return $this->lookahead !== null;
      }

      /**
       * Tells the lexer to skip input tokens until it sees a token with the given type.
       *
       * Does nothing when there is no lookahead token.
       *
       * @param T $type The token type to skip until.
       *
       * @return void
       */
      public function skipUntil(int|string|UnitEnum $type)
      {
          while ($this->lookahead !== null && ! $this->lookahead->isA($type)) {
              $this->moveNext();
          }
      }

      /**
       * Checks if the type computed by getType() for the given value is
       * identical to the given token type.
       *
       * @return bool
       */
      public function isA(string $value, int|string|UnitEnum $token)
      {
          return $this->getType($value) === $token;
      }

      /**
       * Returns the token at the current peek offset and advances that offset.
       *
       * Neither the lookahead nor the position is changed. When no token
       * exists at the peek offset, the offset is left untouched.
       *
       * @return Token<T, V>|null The next token or NULL if there are no more tokens ahead.
       */
      public function peek()
      {
          if (isset($this->tokens[$this->position + $this->peek])) {
              return $this->tokens[$this->position + $this->peek++];
          }

          return null;
      }

      /**
       * Peeks at the next token, returns it and immediately resets the peek.
       *
       * The peek offset is reset to 0, not restored to the value it had before
       * the call.
       *
       * @return Token<T, V>|null The next token or NULL if there are no more tokens ahead.
       */
      public function glimpse()
      {
          $peek       = $this->peek();
          $this->peek = 0;

          return $peek;
      }

      /**
       * Scans the input string for tokens.
       *
       * The input is split with a regex composed from the catchable patterns
       * (each wrapped in a capture group, so the matched text is returned), the
       * non-catchable patterns (not captured, so the matched text is dropped)
       * and the modifiers. Every resulting piece is appended to the token list.
       *
       * @param string $input A query string.
       *
       * @return void
       */
      protected function scan(string $input)
      {
          // Compose the regex once and reuse it for every subsequent scan.
          $this->regex ??= sprintf(
              '/(%s)|%s/%s',
              implode(')|(', $this->getCatchablePatterns()),
              implode('|', $this->getNonCatchablePatterns()),
              $this->getModifiers(),
          );

          $flags   = PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE;
          $matches = preg_split($this->regex, $input, -1, $flags);

          if ($matches === false) {
              // Work around https://bugs.php.net/78122
              // Fall back to treating the whole input as one piece at offset 0.
              $matches = [[$input, 0]];
          }

          foreach ($matches as $match) {
              // Must remain before 'value' assignment since it can change content
              // (getType() receives the value by reference).
              $firstMatch = $match[0];
              $type       = $this->getType($firstMatch);

              $this->tokens[] = new Token(
                  $firstMatch,
                  $type,
                  $match[1],
              );
          }
      }

      /**
       * Gets the literal for a given token.
       *
       * Enum cases are rendered as "EnumClass::CaseName". Any other token is
       * looked up among the constants of the concrete lexer class and rendered
       * as "LexerClass::CONSTANT_NAME" for the first constant whose value is
       * identical to it; when no constant matches, the token is returned as is.
       *
       * @param T $token
       *
       * @return int|string
       */
      public function getLiteral(int|string|UnitEnum $token)
      {
          if ($token instanceof UnitEnum) {
              return $token::class . '::' . $token->name;
          }

          $className = static::class;
          $reflClass = new ReflectionClass($className);

          foreach ($reflClass->getConstants() as $name => $value) {
              if ($value === $token) {
                  return $className . '::' . $name;
              }
          }

          return $token;
      }

      /**
       * Regex modifiers
       *
       * @return string
       */
      protected function getModifiers()
      {
          return 'iu';
      }

      /**
       * Lexical catchable patterns.
       *
       * @return string[]
       */
      abstract protected function getCatchablePatterns();

      /**
       * Lexical non-catchable patterns.
       *
       * @return string[]
       */
      abstract protected function getNonCatchablePatterns();

      /**
       * Retrieve token type. Also processes the token value if necessary.
       *
       * @return T|null
       *
       * @param-out V $value
       */
      abstract protected function getType(string &$value);
  }