Tokenizer.php 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. <?php declare(strict_types = 1);
  2. namespace TheSeer\Tokenizer;
  /**
   * Turns PHP source code into a TokenCollection whose tokens are split per line.
   *
   * The raw result of token_get_all() is normalised in two steps: parse() breaks
   * multi-line tokens into one Token per line, and fillBlanks() inserts empty
   * T_WHITESPACE tokens for line numbers that ended up without any token.
   */
  class Tokenizer {

      /**
       * Token Map for "non-tokens"
       *
       * token_get_all() reports these characters as plain one-character strings
       * rather than [id, text, line] arrays; this map supplies the token name
       * that parse() assigns to them.
       *
       * @var array
       */
      private $map = [
          '(' => 'T_OPEN_BRACKET',
          ')' => 'T_CLOSE_BRACKET',
          '[' => 'T_OPEN_SQUARE',
          ']' => 'T_CLOSE_SQUARE',
          '{' => 'T_OPEN_CURLY',
          '}' => 'T_CLOSE_CURLY',
          ';' => 'T_SEMICOLON',
          '.' => 'T_DOT',
          ',' => 'T_COMMA',
          '=' => 'T_EQUAL',
          '<' => 'T_LT',
          '>' => 'T_GT',
          '+' => 'T_PLUS',
          '-' => 'T_MINUS',
          '*' => 'T_MULT',
          '/' => 'T_DIV',
          '?' => 'T_QUESTION_MARK',
          '!' => 'T_EXCLAMATION_MARK',
          ':' => 'T_COLON',
          '"' => 'T_DOUBLE_QUOTES',
          '@' => 'T_AT',
          '&' => 'T_AMPERSAND',
          '%' => 'T_PERCENT',
          '|' => 'T_PIPE',
          '$' => 'T_DOLLAR',
          '^' => 'T_CARET',
          '~' => 'T_TILDE',
          '`' => 'T_BACKTICK'
      ];

      /**
       * Tokenizes the given source code.
       *
       * @param string $source PHP source code; an empty string yields an empty collection
       *
       * @return TokenCollection the tokens in source order, padded by fillBlanks()
       */
      public function parse(string $source): TokenCollection {
          $result = new TokenCollection();

          if ($source === '') {
              return $result;
          }

          $tokens = \token_get_all($source);

          // One-character string tokens carry no line number of their own and
          // take it from their predecessor, so seed the tracker with the line
          // of the first token.
          $lastToken = new Token(
              $tokens[0][2],
              'Placeholder',
              ''
          );

          foreach ($tokens as $tok) {
              if (\is_string($tok)) {
                  $token = new Token(
                      $lastToken->getLine(),
                      $this->map[$tok],
                      $tok
                  );
                  $result->addToken($token);
                  $lastToken = $token;

                  continue;
              }

              $line = $tok[2];
              // The ungreedy quantifier consumes one line break per match, so
              // consecutive breaks yield empty parts that still advance $line.
              $values = \preg_split('/\R+/Uu', $tok[1]);

              if (!$values) {
                  // preg_split() did not produce any parts; it returns false on
                  // failure, which the "u" modifier triggers for text that is
                  // not valid UTF-8. Emit a single stand-in token instead.
                  $token = new Token(
                      $line,
                      \token_name($tok[0]),
                      '{binary data}'
                  );
                  $result->addToken($token);
                  // Keep the line tracker current so that one-character tokens
                  // following this one are not attributed to an older line.
                  $lastToken = $token;

                  continue;
              }

              foreach ($values as $v) {
                  $token = new Token(
                      $line,
                      \token_name($tok[0]),
                      $v
                  );
                  // Track every part, including empty ones, so the final line
                  // number reflects trailing line breaks.
                  $lastToken = $token;
                  $line++;

                  if ($v === '') {
                      continue;
                  }

                  $result->addToken($token);
              }
          }

          return $this->fillBlanks($result, $lastToken->getLine());
      }

      /**
       * Copies the tokens into a new collection, inserting an empty T_WHITESPACE
       * token for every line number that is skipped between two consecutive
       * tokens and for every line after the last token up to $maxLine.
       *
       * @param TokenCollection $tokens  tokens as produced by parse()
       * @param int             $maxLine last line number that has to be present
       *
       * @return TokenCollection the padded collection
       */
      private function fillBlanks(TokenCollection $tokens, int $maxLine): TokenCollection {
          // Line 0 placeholder: blank lines before the first token are padded too.
          $prev = new Token(
              0,
              'Placeholder',
              ''
          );

          $final = new TokenCollection();

          foreach ($tokens as $token) {
              // Pad up to, but not including, the line of the current token.
              $prev = $this->padUpToLine($final, $prev, $token->getLine() - 1);

              $final->addToken($token);
              $prev = $token;
          }

          $this->padUpToLine($final, $prev, $maxLine);

          return $final;
      }

      /**
       * Appends one empty T_WHITESPACE token per line, starting on the line after
       * $prev and ending on $untilLine. Appends nothing when $prev is already on
       * or beyond $untilLine.
       *
       * @param TokenCollection $final     collection that receives the filler tokens
       * @param Token           $prev      most recently added token
       * @param int             $untilLine last line number to fill (inclusive)
       *
       * @return Token the last filler token appended, or $prev when none was needed
       */
      private function padUpToLine(TokenCollection $final, Token $prev, int $untilLine): Token {
          $missing = $untilLine - $prev->getLine();

          while ($missing > 0) {
              $linebreak = new Token(
                  $prev->getLine() + 1,
                  'T_WHITESPACE',
                  ''
              );
              $final->addToken($linebreak);
              $prev = $linebreak;
              $missing--;
          }

          return $prev;
      }
  }