Standard Portable Library: src/interp/JsLex.cpp Source File

00001 /*
00002  *   This file is part of the Standard Portable Library (SPL).
00003  *
00004  *   SPL is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   SPL is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with SPL.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 #include <spl/interp/JsLex.h>
00018 #include <spl/io/StringStream.h>
00019 
00020 int JsLex::GetCh(  )
00021 {
00022         int ch;
00023         
00024         if (m_pushBack != -1)
00025         {
00026                 ch = m_pushBack;
00027                 m_pushBack = -1;
00028         }
00029         else 
00030         {
00031                 ch = m_text->ReadByte();
00032         }
00033         
00034         if ('\n' == ch)
00035         {
00036                 m_lineNo++;
00037         }
00038         
00039         return ch;
00040 }
00041 
00042 void JsLex::UnGetCh( const int ch )
00043 {
00044         ASSERT(m_pushBack == -1);
00045         
00046         if ('\n' == ch)
00047         {
00048                 m_lineNo--;
00049         }
00050         m_pushBack = ch;
00051 }
00052 
00053 int JsLex::GetEscape(  )
00054 {
00055         int ch = GetCh( );
00056         switch ( ch )
00057         {
00058         case 'n':
00059                 ch = '\n';
00060                 break;
00061         case 'r':
00062                 ch = '\r';
00063                 break;
00064         case 't':
00065                 ch = '\t';
00066                 break;
00067         case 'v':
00068                 ch = '\v';
00069                 break;
00070         case '\'':
00071                 ch = '\'';
00072                 break;
00073         case '\\':
00074                 ch = '\\';
00075                 break;
00076         default:
00077                 ch = -1;
00078         }
00079         return ch;
00080 }
00081 
00082 void JsLex::StripWS()
00083 {
00084         int ch = GetCh();
00085 
00086         while ( IsWs(ch) )
00087         {
00088                 ch = GetCh();
00089         }
00090         UnGetCh( ch );
00091 }
00092 
00093 JsLex::JsLex()
00094 :       m_text(),
00095         m_pushBack(-1),
00096         m_lineNo(1),
00097         m_lexum(),
00098         m_token(T_LEXERROR)
00099 {
00100 }
00101 
00102 JsLex::JsLex(const String& text)
00103 :       m_text(),
00104         m_lexum()
00105 {
00106         Init(text);
00107 }
00108 
00109 JsLex::JsLex(spl::IStreamPtr stream)
00110 :       m_text(),
00111         m_lexum()
00112 {
00113         Init(stream);
00114 }
00115 
00116 JsLex::JsLex(const JsLex& lex)
00117 :       m_text(lex.m_text),
00118         m_pushBack(lex.m_pushBack),
00119         m_lineNo(lex.m_lineNo),
00120         m_lexum(lex.m_lexum),
00121         m_token(lex.m_token)
00122 {
00123 }
00124 
00125 JsLex::~JsLex()
00126 {
00127 }
00128 
00129 JsLex& JsLex::operator =(const JsLex& lex)
00130 {
00131         m_text = lex.m_text;
00132         m_pushBack = lex.m_pushBack;
00133         m_lineNo = lex.m_lineNo;
00134         m_lexum = lex.m_lexum;
00135         m_token = lex.m_token;
00136         
00137         return *this;
00138 }
00139 
00140 void JsLex::Init(const String& text)
00141 {
00142         Init(StringStreamPtr(new StringStream(text)));
00143 }
00144 
00145 void JsLex::Init(spl::IStreamPtr text)
00146 {
00147         m_text = text;
00148         m_pushBack = -1;
00149         m_lineNo = 1;
00150         m_lexum.Clear();
00151         m_token = T_LEXERROR;
00152         
00153         int ch = GetCh();
00154         if ('#' == ch)
00155         {
00156                 //shebang support (#! /usr/local/bin/js)
00157                 while ( '\r' != ch && '\n' != ch )
00158                 {
00159                         ch = GetCh();
00160                 }
00161         }
00162         UnGetCh( ch );
00163 }
00164 
00165 JsLex::Token JsLex::Next()
00166 {
00167         m_lexum.Clear();
00168         
00169         StripWS();
00170 
00171         int ch = GetCh();
00172         if ( ch <= 0 )
00173         {
00174                 m_token = T_JEOF;
00175                 return m_token;
00176         }
00177         if ( isdigit( ch ) || ch == '.' )
00178         {
00179                 Token toke = T_INT;
00180 
00181                 if ( ch == '.' )
00182                 {
00183                         ch = GetCh();
00184                         if ( ! isdigit( ch ) )
00185                         {
00186                                 UnGetCh( ch );
00187                                 m_token = T_DOT;
00188                                 return m_token;
00189                         }
00190                         toke = T_REAL;
00191                         UnGetCh( ch );
00192                         ch = '.';
00193                 }
00194 
00195                 /*
00196                  * read some sort of number
00197                  */
00198                 while ( isdigit(ch) )
00199                 {
00200                         m_lexum.Append((char)ch);
00201                         ch = GetCh();
00202                 }
00203                 if ( ch == 'x' )
00204                 {
00205                         if ( m_lexum.Length() != 1 )
00206                         {
00207                                 m_lexum.Set("Invalid number format.");
00208                                 m_token = T_LEXERROR;
00209                                 return m_token;
00210                         }
00211                         if ( m_lexum.CharAt(0) != '0' )
00212                         {
00213                                 m_lexum.Set("Invalid number format.");
00214                                 m_token = T_LEXERROR;
00215                                 return m_token;
00216                         }
00217                         m_lexum.Append((char)ch);
00218                         ch = GetCh();
00219                         while ( isdigit(ch) )
00220                         {
00221                                 m_lexum.Append((char)ch);
00222                                 ch = GetCh();
00223                         }
00224                         UnGetCh(ch);
00225                         m_token = T_HEX;
00226                         return m_token;
00227                 }
00228                 if ( ch == '.' )
00229                 {
00230                         toke = T_REAL;
00231 
00232                         m_lexum.Append((char)ch);
00233                         ch = GetCh();
00234                         while ( isdigit(ch) )
00235                         {
00236                                 m_lexum.Append((char)ch);
00237                                 ch = GetCh();
00238                         }
00239                 }
00240                 if ( ch == 'e' || ch == 'E' )
00241                 {
00242                         toke = T_REAL;
00243                         m_lexum.Append((char)ch);
00244                         ch = GetCh();
00245                         if ( ch == '+' || ch == '-' )
00246                         {
00247                                 m_lexum.Append((char)ch);
00248                                 ch = GetCh();
00249                         }
00250                         while ( isdigit(ch) )
00251                         {
00252                                 m_lexum.Append((char)ch);
00253                                 ch = GetCh();
00254                         }
00255                 }
00256                 UnGetCh(ch);
00257                 m_token = toke;
00258                 return m_token;
00259         }
00260         else if ( isalpha( ch ) || ch == '_' )
00261         {
00262                 /*
00263                  * identifier
00264                  */
00265                 while ( isalnum( ch ) || ch == '_' )
00266                 {
00267                         m_lexum.Append((char)ch);
00268                         ch = GetCh();
00269                 }
00270                 UnGetCh(ch);
00271                 if ( m_lexum.Equals("var") )
00272                 {
00273                         m_token = T_VAR;
00274                         return m_token;
00275                 }
00276                 if ( m_lexum.Equals("if") )
00277                 {
00278                         m_token = T_IF;
00279                         return m_token;
00280                 }
00281                 if ( m_lexum.Equals("else") )
00282                 {
00283                         m_token = T_ELSE;
00284                         return m_token;
00285                 }
00286                 if ( m_lexum.Equals("return") )
00287                 {
00288                         m_token = T_RETURN;
00289                         return m_token;
00290                 }
00291                 if ( m_lexum.Equals("while") )
00292                 {
00293                         m_token = T_WHILE;
00294                         return m_token;
00295                 }
00296                 if ( m_lexum.Equals("do") )
00297                 {
00298                         m_token = T_DO;
00299                         return m_token;
00300                 }
00301                 if ( m_lexum.Equals("for") )
00302                 {
00303                         m_token = T_FOR;
00304                         return m_token;
00305                 }
00306                 if ( m_lexum.Equals("break") )
00307                 {
00308                         m_token = T_BREAK;
00309                         return m_token;
00310                 }
00311                 if ( m_lexum.Equals("static") )
00312                 {
00313                         m_token = T_STATIC;
00314                         return m_token;
00315                 }
00316                 if ( m_lexum.Equals("class") )
00317                 {
00318                         m_token = T_CLASS;
00319                         return m_token;
00320                 }
00321                 if ( m_lexum.Equals("public") )
00322                 {
00323                         m_token = T_PUBLIC;
00324                         return m_token;
00325                 }
00326                 if ( m_lexum.Equals("private") )
00327                 {
00328                         m_token = T_PRIVATE;
00329                         return m_token;
00330                 }
00331                 if ( m_lexum.Equals("protected") )
00332                 {
00333                         m_token = T_PROTECTED;
00334                         return m_token;
00335                 }
00336                 if ( m_lexum.Equals("continue") )
00337                 {
00338                         m_token = T_CONTINUE;
00339                         return m_token;
00340                 }
00341                 if ( m_lexum.Equals("switch") )
00342                 {
00343                         m_token = T_SWITCH;
00344                         return m_token;
00345                 }
00346                 if ( m_lexum.Equals("case") )
00347                 {
00348                         m_token = T_CASE;
00349                         return m_token;
00350                 }
00351                 if ( m_lexum.Equals("default") )
00352                 {
00353                         m_token = T_DEFAULT;
00354                         return m_token;
00355                 }
00356                 if ( m_lexum.Equals("new") )
00357                 {
00358                         m_token = T_NEW;
00359                         return m_token;
00360                 }
00361                 if ( m_lexum.Equals("delete") )
00362                 {
00363                         m_token = T_DELETE;
00364                         return m_token;
00365                 }
00366                 if ( m_lexum.Equals("const") )
00367                 {
00368                         m_token = T_CONST;
00369                         return m_token;
00370                 }
00371                 if ( m_lexum.Equals("null") )
00372                 {
00373                         m_token = T_JNULL;
00374                         return m_token;
00375                 }
00376                 if ( m_lexum.Equals("try") )
00377                 {
00378                         m_token = T_TRY;
00379                         return m_token;
00380                 }
00381                 if ( m_lexum.Equals("catch") )
00382                 {
00383                         m_token = T_CATCH;
00384                         return m_token;
00385                 }
00386                 if ( m_lexum.Equals("finally") )
00387                 {
00388                         m_token = T_FINALLY;
00389                         return m_token;
00390                 }
00391                 if ( m_lexum.Equals("throw") )
00392                 {
00393                         m_token = T_THROW;
00394                         return m_token;
00395                 }
00396                 if ( m_lexum.Equals("true") )
00397                 {
00398                         m_token = T_YYTRUE;
00399                         return m_token;
00400                 }
00401                 if ( m_lexum.Equals("false") )
00402                 {
00403                         m_token = T_YYFALSE;
00404                         return m_token;
00405                 }
00406                 m_token = T_ID;
00407                 return m_token;
00408         }
00409         switch ( ch )
00410         {
00411         case '\'':
00412                 /*
00413                  * char
00414                  */
00415                 ch = GetCh();
00416                 if ( ch == '\\' )
00417                 {
00418                         ch = GetEscape();
00419                         if ( -1 == ch )
00420                         {
00421                                 m_lexum.Set("Unexpected escape");
00422                                 m_token = T_LEXERROR;
00423                                 return m_token;
00424                         }
00425                 }
00426                 m_lexum.Append((char)ch);
00427                 ch = GetCh();
00428                 if ( ch != '\'' )
00429                 {
00430                         m_lexum.Set("Unterminated character");
00431                         m_token = T_LEXERROR;
00432                         return m_token;
00433                 }
00434                 m_token = T_CHAR;
00435                 return m_token;
00436 
00437         case '"':
00438                 /*
00439                  * string
00440                  */
00441                 while ( (ch = GetCh()) != -1 && ch != '"' )
00442                 {
00443                         if ( ch == '\\' )
00444                         {
00445                                 ch = GetEscape();
00446                                 if ( -1 == ch )
00447                                 {
00448                                         m_lexum.Set("Unexpected escape");
00449                                         m_token = T_LEXERROR;
00450                                         return m_token;
00451                                 }
00452                         }
00453                         m_lexum.Append((char)ch);
00454                 }
00455                 if ( ch == -1 )
00456                 {
00457                         m_lexum.Set("Unterminated escape");
00458                         m_token = T_LEXERROR;
00459                         return m_token;
00460                 }
00461                 m_token = T_STRING;
00462                 return m_token;
00463 
00464         case '|':
00465                 /*
00466                  * or
00467                  */
00468                 ch = GetCh();
00469                 if ( '|' == ch )
00470                 {
00471                         m_token = T_OR;
00472                         return m_token;
00473                 }
00474                 if ( '=' == ch )
00475                 {
00476                         m_token = T_OREQ;
00477                         return m_token;
00478                 }
00479                 UnGetCh(ch);
00480                 m_token = T_PIPE;
00481                 return m_token;
00482 
00483         case '&':
00484                 /*
00485                  * and
00486                  */
00487                 ch = GetCh();
00488                 if ( '&' == ch )
00489                 {
00490                         m_token = T_AND;
00491                         return m_token;
00492                 }
00493                 if ( '=' == ch )
00494                 {
00495                         m_token = T_ANDEQ;
00496                         return m_token;
00497                 }
00498                 UnGetCh(ch);
00499                 m_token = T_AMPR;
00500                 return m_token;
00501 
00502         case '{':
00503                 m_token = T_LBRACE;
00504                 return m_token;
00505 
00506         case '}':
00507                 m_token = T_RBRACE;
00508                 return m_token;
00509 
00510         case '(':
00511                 m_token = T_LPAR;
00512                 return m_token;
00513 
00514         case ')':
00515                 m_token = T_RPAR;
00516                 return m_token;
00517 
00518         case '[':
00519                 m_token = T_LBRAC;
00520                 return m_token;
00521 
00522         case ']':
00523                 m_token = T_RBRAC;
00524                 return m_token;
00525 
00526         case '!':
00527                 ch = GetCh();
00528                 if ( '=' == ch )
00529                 {
00530                         m_token = T_ISNEQ;
00531                         return m_token;
00532                 }
00533                 UnGetCh(ch);
00534                 m_token = T_BANG;
00535                 return m_token;
00536 
00537         case '=':
00538                 ch = GetCh();
00539                 if ( '=' == ch )
00540                 {
00541                         m_token = T_ISEQUAL;
00542                         return m_token;
00543                 }
00544                 UnGetCh(ch);
00545                 m_token = T_ASSIGN;
00546                 return m_token;
00547 
00548         case '+':
00549                 ch = GetCh();
00550                 if ( '=' == ch )
00551                 {
00552                         m_token = T_PLUSEQ;
00553                         return m_token;
00554                 }
00555                 if ( '+' == ch )
00556                 {
00557                         m_token = T_INC;
00558                         return m_token;
00559                 }
00560                 UnGetCh( ch );
00561                 m_token = T_PLUS;
00562                 return m_token;
00563 
00564         case '-':
00565                 ch = GetCh();
00566                 if ( '=' == ch )
00567                 {
00568                         m_token = T_MINUSEQ;
00569                         return m_token;
00570                 }
00571                 if ( '-' == ch )
00572                 {
00573                         m_token = T_DEC;
00574                         return m_token;
00575                 }
00576                 UnGetCh( ch );
00577                 m_token = T_MINUS;
00578                 return m_token;
00579 
00580         case '*':
00581                 ch = GetCh();
00582                 if ( '=' == ch )
00583                 {
00584                         m_token = T_TIMESEQ;
00585                         return m_token;
00586                 }
00587                 UnGetCh(ch);
00588                 m_token = T_STAR;
00589                 return m_token;
00590 
00591         case '/':
00592                 ch = GetCh();
00593                 if ( '=' == ch )
00594                 {
00595                         m_token = T_DIVEQ;
00596                         return m_token;
00597                 }
00598                 if ( '/' == ch )
00599                 {
00600                         /*
00601                          * EOL comment
00602                          */
00603                         while ( ch != '\n' && ch > 0 )
00604                         {
00605                                 ch = GetCh();
00606                         }
00607                         return Next();
00608                 }
00609                 if ( '*' == ch )
00610                 {
00611                         /*
00612                          *  Mult-line comment
00613                          */
00614                         int opencount = 1;
00615                         while ( opencount > 0 && ch > 0 )
00616                         {
00617                                 while ( ch != '*' && ch != '/' && ch > 0 )
00618                                 {
00619                                         ch = GetCh();
00620                                 }
00621                                 if ( ch == '*' )
00622                                 {
00623                                         ch = GetCh();
00624                                         if ( ch == '/' )
00625                                         {
00626                                                 opencount--;
00627                                         }
00628                                 }
00629                                 if ( ch == '/' )
00630                                 {
00631                                         ch = GetCh();
00632                                         if ( ch == '*' )
00633                                         {
00634                                                 opencount++;
00635                                         }
00636                                 }
00637                         }
00638                         return Next();
00639                 }
00640                 UnGetCh(ch);
00641                 m_token = T_SLASH;
00642                 return m_token;
00643 
00644         case '%':
00645                 ch = GetCh();
00646                 if ( '=' == ch )
00647                 {
00648                         m_token = T_MODEQ;
00649                         return m_token;
00650                 }
00651                 UnGetCh(ch);
00652                 m_token = T_MOD;
00653                 return m_token;
00654         
00655         case '^':
00656                 ch = GetCh();
00657                 if ( '=' == ch )
00658                 {
00659                         m_token = T_XOREQ;
00660                         return m_token;
00661                 }
00662                 UnGetCh(ch);
00663                 m_token = T_XOR;
00664                 return m_token;
00665         
00666         case '~':
00667                 ch = GetCh();
00668                 if ( '=' == ch )
00669                 {
00670                         m_token = T_COMPEQ;
00671                         return m_token;
00672                 }
00673                 UnGetCh(ch);
00674                 m_token = T_COMP;
00675                 return m_token;
00676 
00677         case ':':
00678                 ch = GetCh();
00679                 if ( ':' == ch )
00680                 {
00681                         m_token = T_SCOPE;
00682                         return m_token;
00683                 }
00684                 UnGetCh(ch);
00685                 m_token = T_COLON;
00686                 return m_token;
00687 
00688         case ';':
00689                 m_token = T_SEMI;
00690                 return m_token;
00691 
00692         case ',':
00693                 m_token = T_COMMA;
00694                 return m_token;
00695 
00696         case '<':
00697                 ch = GetCh();
00698                 if ( '=' == ch )
00699                 {
00700                         m_token = T_LTEQ;
00701                         return m_token;
00702                 }
00703                 else if ( '<' == ch )
00704                 {
00705                         m_token = T_LSHIFT;
00706                         return m_token;
00707                 }
00708                 UnGetCh(ch);
00709                 m_token = T_LT;
00710                 return m_token;
00711 
00712         case '>':
00713                 ch = GetCh();
00714                 if ( '=' == ch )
00715                 {
00716                         m_token = T_GTEQ;
00717                         return m_token;
00718                 }
00719                 else if ( '>' == ch )
00720                 {
00721                         m_token = T_RSHIFT;
00722                         return m_token;
00723                 }
00724                 UnGetCh(ch);
00725                 m_token = T_GT;
00726                 return m_token;
00727 
00728         case -1:
00729                 m_token = T_JEOF;
00730                 return m_token;
00731 
00732         default:
00733                 m_lexum.Set("Internal lexer error");
00734                 m_token = T_LEXERROR;
00735                 return m_token;
00736         }
00737 #ifdef _WINDOWS
00738         m_token = T_LEXERROR;           /* make certian compilers happy */
00739         return m_token;
00740 #endif
00741 }
00742 
00743 #if defined(DEBUG) || defined(_DEBUG)
00744 void JsLex::CheckMem() const
00745 {
00746         m_text.CheckMem();
00747         m_lexum.CheckMem();
00748 }
00749 
00750 void JsLex::ValidateMem() const
00751 {
00752         m_text.ValidateMem();
00753         m_lexum.ValidateMem();
00754 }
00755 #endif