Viewing file: Lexer.php (19.28 KB) -rw-rw-rw- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
<?php /* vim: set expandtab tabstop=4 shiftwidth=4: */ // +----------------------------------------------------------------------+ // | Copyright (c) 2002-2004 Brent Cook | // +----------------------------------------------------------------------+ // | This library is free software; you can redistribute it and/or | // | modify it under the terms of the GNU Lesser General Public | // | License as published by the Free Software Foundation; either | // | version 2.1 of the License, or (at your option) any later version. | // | | // | This library is distributed in the hope that it will be useful, | // | but WITHOUT ANY WARRANTY; without even the implied warranty of | // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | // | Lesser General Public License for more details. | // | | // | You should have received a copy of the GNU Lesser General Public | // | License along with this library; if not, write to the Free Software | // | Foundation, Inc., 59 Temple Place, Suite 330,Boston,MA 02111-1307 USA| // +----------------------------------------------------------------------+ // | Authors: Brent Cook <busterbcook@yahoo.com> | // | Jason Pell <jasonpell@hotmail.com> | // +----------------------------------------------------------------------+ // // $Id: Lexer.php,v 1.20 2004/05/07 12:33:35 busterb Exp $ //
include 'SQL/ctype.php';
// {{{ token definitions // variables: 'ident', 'sys_var' // values: 'real_val', 'text_val', 'int_val', null // }}}
/**
 * A lexical analyser inspired by the msql lexer.
 *
 * The lexer walks over an input string one character at a time and turns it
 * into tokens. nextToken() returns the token type and leaves the token's
 * text (or numeric value) in $tokText and any whitespace/comments that were
 * skipped in front of it in $skipText.
 *
 * Token types returned:
 *   - a lower-cased key of $symbols, when the word is a registered symbol
 *   - 'ident', 'sys_var'
 *   - 'int_val', 'real_val', 'text_val'
 *   - the literal text itself for comparison operators, '...', an escaped
 *     quote/backslash, or any single unrecognised character
 *   - null at end of input
 *
 * NOTE(review): character classes are tested with ctype_*(ord($c)), exactly
 * as before. The implementation behind those calls comes from either the
 * ctype extension or the included SQL/ctype.php, which is not visible here;
 * passing integers to the native ctype functions is deprecated as of
 * PHP 8.1 - confirm the shim's signature before changing the call sites.
 *
 * NOTE(review): a '-' that is followed by neither a digit, a letter nor a
 * second '-' is lexed as 'int_val' with value 0 (states 5 and 6), and an
 * exponent must carry an explicit sign (state 15). Both are long-standing
 * behaviour and were left alone.
 *
 * @author  Brent Cook <busterbcook@yahoo.com>
 * @version 0.5
 * @access  public
 * @package SQL_Parser
 */
class Lexer
{
    // array of valid tokens for the lexer to recognize
    // format is 'token literal'=>TOKEN_VALUE
    var $symbols = array();

    // {{{ instance variables
    // Offset in $string of the next character get() will return.
    var $tokPtr = 0;
    // Offset in $string at which the token being scanned starts.
    var $tokStart = 0;
    // Number of characters consumed so far for the current token.
    var $tokLen = 0;
    // Text (or numeric value) of the most recently scanned token.
    var $tokText = '';
    // Number of line breaks seen so far.
    var $lineNo = 0;
    // Value of $tokPtr recorded when the last line break was processed.
    var $lineBegin = 0;
    // The input being scanned and its length in bytes.
    var $string = '';
    var $stringLen = 0;

    // Value of $tokStart on entry to nextToken().
    // Will not be altered by skip()
    var $tokAbsStart = 0;
    // Input between $tokAbsStart and the start of the returned token.
    var $skipText = '';

    // Provide lookahead capability.
    // Specifies how many tokens to save in tokenStack, so the
    // token stream can be pushed back.
    var $lookahead = 0;
    var $tokenStack = array();
    var $stackPtr = 0;
    // }}}

    // {{{ incidental functions
    /**
     * Constructor.
     *
     * @param string $string    the text to tokenize
     * @param int    $lookahead number of tokens that may be pushed back
     */
    function __construct($string = '', $lookahead = 0)
    {
        $this->string    = $string;
        $this->stringLen = strlen($string);
        $this->lookahead = $lookahead;
    }

    /**
     * PHP 4 style constructor, kept so that PHP 4 and code calling
     * parent::Lexer() / $lexer->Lexer() keep working. PHP 5 and later
     * use __construct() directly.
     *
     * @param string $string    the text to tokenize
     * @param int    $lookahead number of tokens that may be pushed back
     */
    function Lexer($string = '', $lookahead = 0)
    {
        $this->__construct($string, $lookahead);
    }

    /**
     * Consume one character and count it as part of the current token.
     *
     * The pointer and length are advanced even when the end of the input
     * has been passed, so that a following unget() restores them.
     *
     * @return string|null the character read, or null past end of input
     */
    function get()
    {
        ++$this->tokPtr;
        ++$this->tokLen;
        if ($this->tokPtr > $this->stringLen) {
            return null;
        }
        return $this->string[$this->tokPtr - 1];
    }

    /**
     * Give back the character most recently consumed by get().
     */
    function unget()
    {
        --$this->tokPtr;
        --$this->tokLen;
    }

    /**
     * Consume one character that is not part of any token.
     *
     * The token start is always moved forward; the read pointer only moves
     * when there is a character left to read.
     *
     * @return string the character read, or '' at end of input
     */
    function skip()
    {
        ++$this->tokStart;
        if ($this->tokPtr == $this->stringLen) {
            return '';
        }
        return $this->string[$this->tokPtr++];
    }

    /**
     * Rewind the read pointer to the start of the current token.
     */
    function revert()
    {
        $this->tokPtr = $this->tokStart;
        $this->tokLen = 0;
    }

    /**
     * Tell whether a character can be part of a comparison operator.
     *
     * @param string $c the character to test
     * @return bool true for '<', '>', '=' and '!'
     */
    function isCompop($c)
    {
        return (($c == '<') || ($c == '>') || ($c == '=') || ($c == '!'));
    }

    /**
     * Finish the current token: record the text that was skipped in front
     * of it and make the next token start at the current read position.
     *
     * @access private
     */
    function _commitToken()
    {
        $this->skipText = substr($this->string, $this->tokAbsStart,
                                 $this->tokStart - $this->tokAbsStart);
        $this->tokStart = $this->tokPtr;
    }
    // }}}

    // {{{ pushBack()
    /*
     * Push back a token, so the very next call to lex() will return that token.
     * Calls to this function will be ignored if there is no lookahead specified
     * to the constructor, or the pushBack() function has already been called the
     * maximum number of tokens that can be looked ahead.
     */
    function pushBack()
    {
        if ($this->lookahead > 0 && count($this->tokenStack) > 0 && $this->stackPtr > 0) {
            $this->stackPtr--;
        }
    }
    // }}}

    // {{{ lex()
    /**
     * Return the next token, honouring tokens handed back via pushBack().
     *
     * On return $tokText and $skipText describe the returned token.
     *
     * @return mixed the token type, see nextToken()
     */
    function lex()
    {
        // Without lookahead there is no stack to maintain.
        if (!($this->lookahead > 0)) {
            return $this->nextToken();
        }

        // The stackPtr should always be the same as the count of elements
        // in the tokenStack; it can be thought of as pointing to the next
        // token to be added. After a pushBack() call it is less than the
        // count, which means the token must be taken from the stack
        // instead of being scanned.
        if ($this->stackPtr < count($this->tokenStack)) {
            $saved = $this->tokenStack[$this->stackPtr];
            $this->tokText  = $saved['tokText'];
            $this->skipText = $saved['skipText'];
            // We have read the token, so move on to the next slot.
            $this->stackPtr++;
            return $saved['token'];
        }

        // If the stack is full (equal to lookahead), drop the oldest
        // element to make room for the new one. The elements are moved by
        // hand because the original author found that array_shift and
        // array_pop disturbed the indexing.
        if ($this->stackPtr == $this->lookahead) {
            $last = count($this->tokenStack) - 1;
            for ($i = 0; $i < $last; $i++) {
                $this->tokenStack[$i] = $this->tokenStack[$i + 1];
            }
            // The new element goes into the slot that was just freed.
            $this->stackPtr--;
        }

        $token = $this->nextToken();
        $this->tokenStack[$this->stackPtr] = array(
            'token'    => $token,
            'tokText'  => $this->tokText,
            'skipText' => $this->skipText
        );
        $this->stackPtr++;
        return $token;
    }
    // }}}

    // {{{ nextToken()
    /**
     * Scan the next token directly from the input.
     *
     * Implemented as a state machine; each case of the switch is one state
     * and sets $state to the state to enter next.
     *
     * @return mixed the token type (see the class comment), or null when
     *               the input is empty or exhausted
     */
    function nextToken()
    {
        if ($this->string == '') {
            return null;
        }
        $state = 0;
        $quote = '';
        $this->tokAbsStart = $this->tokStart;

        while (true) {
            switch ($state) {
                // {{{ State 0 : Start of token
                case 0:
                    $this->tokPtr  = $this->tokStart;
                    $this->tokText = '';
                    $this->tokLen  = 0;
                    $c = $this->get();

                    if (is_null($c)) { // End Of Input
                        $state = 1000;
                        break;
                    }

                    while (($c == ' ') || ($c == "\t") || ($c == "\n") || ($c == "\r")) {
                        if ($c == "\n" || $c == "\r") {
                            // Handle MAC/Unix/Windows line endings: a CR
                            // directly followed by LF is one line break.
                            // The LF is peeked at rather than read and put
                            // back, because skip() moves tokStart forward
                            // and unget() cannot undo that.
                            if ($c == "\r"
                                && $this->tokPtr < $this->stringLen
                                && $this->string[$this->tokPtr] == "\n") {
                                $this->skip();
                            }
                            ++$this->lineNo;
                            $this->lineBegin = $this->tokPtr;
                        }
                        $c = $this->skip();
                        $this->tokLen = 1;
                    }

                    if ($c === '') {
                        // Only whitespace was left: report end of input.
                        $state = 1000;
                        break;
                    }

                    // Escape quotes and backslashes
                    if ($c == '\\') {
                        $t = $this->get();
                        if ($t == '\'' || $t == '\\' || $t == '"') {
                            $this->tokText = $t;
                            $this->_commitToken();
                            return $this->tokText;
                        }
                        $this->unget();
                        // Unknown token. Revert to single char
                        $state = 999;
                        break;
                    }

                    if (($c == '\'') || ($c == '"')) { // text string
                        $quote = $c;
                        $state = 12;
                        break;
                    }

                    if ($c == '_') { // system variable
                        $state = 18;
                        break;
                    }

                    if (ctype_alpha(ord($c))) { // keyword or ident
                        $state = 1;
                        break;
                    }

                    if (ctype_digit(ord($c))) { // real or int number
                        $state = 5;
                        break;
                    }

                    if ($c == '.') {
                        $t = $this->get();
                        if ($t == '.') { // ellipsis
                            if ($this->get() == '.') {
                                $this->tokText = '...';
                                $this->_commitToken();
                                return $this->tokText;
                            }
                            $state = 999;
                            break;
                        }
                        if (ctype_digit(ord($t))) { // real number
                            $this->unget();
                            $state = 7;
                            break;
                        }
                        // A lone period: give the look-ahead back and let
                        // the checks below route it to state 999.
                        $this->unget();
                    }

                    if ($c == '#') { // Comments
                        $state = 14;
                        break;
                    }

                    if ($c == '-') {
                        $t = $this->get();
                        if ($t == '-') { // "--" starts a comment
                            $state = 14;
                            break;
                        }
                        // negative number
                        $this->unget();
                        $state = 5;
                        break;
                    }

                    if ($this->isCompop($c)) { // comparison operator
                        $state = 10;
                        break;
                    }

                    // Unknown token. Revert to single char
                    $state = 999;
                    break;
                // }}}

                // {{{ State 1 : Incomplete keyword or ident
                case 1:
                    $c = $this->get();
                    if (ctype_alnum(ord($c)) || ($c == '_') || ($c == '.')) {
                        $state = 1;
                    } else {
                        $state = 2;
                    }
                    break;
                // }}}

                // {{{ State 2 : Complete keyword or ident
                case 2:
                    $this->unget();
                    $this->tokText = substr($this->string, $this->tokStart, $this->tokLen);
                    // Symbols are matched case-insensitively.
                    $testToken = strtolower($this->tokText);
                    $this->_commitToken();
                    if (isset($this->symbols[$testToken])) {
                        return $testToken;
                    }
                    return 'ident';
                // }}}

                // {{{ State 5: Incomplete real or int number
                case 5:
                    $c = $this->get();
                    if (ctype_digit(ord($c))) {
                        $state = 5;
                    } else if ($c == '.') {
                        // Look one character past the period, then give it
                        // back so that the next state sees it again.
                        $t = $this->get();
                        $this->unget();
                        if ($t == '.') {
                            // An ellipsis follows: the number is an integer
                            // and state 6 gives the first period back.
                            $state = 6;
                        } else { // real number
                            $state = 7;
                        }
                    } else if (ctype_alpha(ord($c))) {
                        // number must end with non-alpha character
                        $state = 999;
                    } else { // complete number
                        $state = 6;
                    }
                    break;
                // }}}

                // {{{ State 6: Complete integer number
                case 6:
                    $this->unget();
                    $this->tokText = intval(substr($this->string, $this->tokStart, $this->tokLen));
                    $this->_commitToken();
                    return 'int_val';
                // }}}

                // {{{ State 7: Incomplete real number
                case 7:
                    $c = $this->get();
                    if ($c == 'e' || $c == 'E') { // exponent follows
                        $state = 15;
                    } else if (ctype_digit(ord($c))) {
                        $state = 7;
                    } else {
                        $state = 8;
                    }
                    break;
                // }}}

                // {{{ State 8: Complete real number
                case 8:
                    $this->unget();
                    $this->tokText = floatval(substr($this->string, $this->tokStart, $this->tokLen));
                    $this->_commitToken();
                    return 'real_val';
                // }}}

                // {{{ State 10: Incomplete comparison operator
                case 10:
                    $c = $this->get();
                    if ($this->isCompop($c)) {
                        $state = 10;
                    } else {
                        $state = 11;
                    }
                    break;
                // }}}

                // {{{ State 11: Complete comparison operator
                case 11:
                    $this->unget();
                    $this->tokText = substr($this->string, $this->tokStart, $this->tokLen);
                    if ($this->tokText) {
                        $this->_commitToken();
                        return $this->tokText;
                    }
                    $state = 999;
                    break;
                // }}}

                // {{{ State 12: Incomplete text string
                case 12:
                    $text = null; // stays null when the string never closes
                    while (true) {
                        $ch = $this->get();
                        if (is_null($ch)) { // ran off the end of the input
                            break;
                        }
                        if ($ch == '\\') {
                            // Swallow the escaped character whatever it is.
                            // Only a missing character aborts; an escaped
                            // '0' is a valid character, not end of input.
                            if (is_null($this->get())) {
                                break;
                            }
                            continue;
                        }
                        if ($ch == $quote) {
                            // Strip the surrounding quotes.
                            $text = stripslashes(substr($this->string,
                                                        $this->tokStart + 1,
                                                        $this->tokLen - 2));
                            break;
                        }
                    }
                    $this->tokText = $text;
                    $state = is_null($text) ? 999 : 13;
                    break;
                // }}}

                // {{{ State 13: Complete text string
                case 13:
                    $this->_commitToken();
                    return 'text_val';
                // }}}

                // {{{ State 14: Comment
                case 14:
                    $c = $this->skip();
                    if ($c == "\n" || $c == "\r" || $c == "") {
                        // Handle MAC/Unix/Windows line endings: swallow the
                        // LF of a CR LF pair. The LF is peeked at instead of
                        // read and put back, so that nothing is rewound
                        // when the CR is the last character of the input.
                        if ($c == "\r"
                            && $this->tokPtr < $this->stringLen
                            && $this->string[$this->tokPtr] == "\n") {
                            $c = $this->skip();
                        }

                        if ($c != "") {
                            ++$this->lineNo;
                            $this->lineBegin = $this->tokPtr;
                        }

                        // We need to skip all the text.
                        $this->tokStart = $this->tokPtr;
                        $state = 0;
                    } else {
                        $state = 14;
                    }
                    break;
                // }}}

                // {{{ State 15: Exponent Sign in Scientific Notation
                case 15:
                    $c = $this->get();
                    if ($c == '-' || $c == '+') {
                        $state = 16;
                    } else {
                        $state = 999;
                    }
                    break;
                // }}}

                // {{{ State 16: Exponent Value-first digit in Scientific Notation
                case 16:
                    $c = $this->get();
                    if (ctype_digit(ord($c))) {
                        $state = 17;
                    } else {
                        $state = 999; // if no digit, then token is unknown
                    }
                    break;
                // }}}

                // {{{ State 17: Exponent Value in Scientific Notation
                case 17:
                    $c = $this->get();
                    if (ctype_digit(ord($c))) {
                        $state = 17;
                    } else {
                        $state = 8; // At least 1 exponent digit was required
                    }
                    break;
                // }}}

                // {{{ State 18 : Incomplete System Variable
                case 18:
                    $c = $this->get();
                    if (ctype_alnum(ord($c)) || $c == '_') {
                        $state = 18;
                    } else {
                        $state = 19;
                    }
                    break;
                // }}}

                // {{{ State 19: Complete Sys Var
                case 19:
                    $this->unget();
                    $this->tokText = substr($this->string, $this->tokStart, $this->tokLen);
                    $this->_commitToken();
                    return 'sys_var';
                // }}}

                // {{{ State 999 : Unknown token. Revert to single char
                case 999:
                    $this->revert();
                    $this->tokText = $this->get();
                    $this->_commitToken();
                    return $this->tokText;
                // }}}

                // {{{ State 1000 : End Of Input
                case 1000:
                    $this->tokText = '*end of input*';
                    $this->_commitToken();
                    return null;
                // }}}
            }
        }
    }
    // }}}
}
?>
|