token.cc Source File

00001 /*
00002   Copyright (C) 1997-2001 Shigeru Chiba, Tokyo Institute of Technology.
00003 
00004   Permission to use, copy, distribute and modify this software and   
00005   its documentation for any purpose is hereby granted without fee,        
00006   provided that the above copyright notice appear in all copies and that 
00007   both that copyright notice and this permission notice appear in 
00008   supporting documentation.
00009 
00010   Shigeru Chiba makes no representations about the suitability of this 
00011   software for any purpose.  It is provided "as is" without express or
00012   implied warranty.
00013 */
00014 
00015 /*
00016   Copyright (c) 1995, 1996 Xerox Corporation.
00017   All Rights Reserved.
00018 
00019   Use and copying of this software and preparation of derivative works
00020   based upon this software are permitted. Any copy of this software or
00021   of any derivative work must include the above copyright notice of
00022   Xerox Corporation, this paragraph and the one after it.  Any
00023   distribution of this software or derivative works must comply with all
00024   applicable United States export control laws.
00025 
00026   This software is made available AS IS, and XEROX CORPORATION DISCLAIMS
00027   ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE
00028   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00029   PURPOSE, AND NOTWITHSTANDING ANY OTHER PROVISION CONTAINED HEREIN, ANY
00030   LIABILITY FOR DAMAGES RESULTING FROM THE SOFTWARE OR ITS USE IS
00031   EXPRESSLY DISCLAIMED, WHETHER ARISING IN CONTRACT, TORT (INCLUDING
00032   NEGLIGENCE) OR STRICT LIABILITY, EVEN IF XEROX CORPORATION IS ADVISED
00033   OF THE POSSIBILITY OF SUCH DAMAGES.
00034 */
00035 
00036 #include <stdlib.h>
00037 #include <string.h>
00038 #include "token.h"
00039 #include "hash.h"
00040 #include "ptree.h"
00041 #include "buffer.h"
00042 
00043 #if defined(_PARSE_VCC)
00044 #define _MSC_VER        1100
00045 #endif
00046 
00047 #if defined(_MSC_VER)
00048 #include <assert.h>             // for assert in InitializeOtherKeywords
00049 #endif
00050 
00051 extern BOOL regularCpp;         // defined in main.cc
00052 static void InitializeOtherKeywords();
00053 
/*
  token(x) wraps every token kind the lexer returns.

  In the stand-alone TEST build a token is represented by the address of
  its spelled-out name, so that the test driver can print it.  The
  standard stringizing operator '#' is used for every compiler; writing
  "x" inside the replacement list does not substitute the argument under
  ISO preprocessing and would turn every token into the string "x".

  In the regular build token(x) is simply x.
*/
#ifdef TEST

#define token(x)        (long)#x

#else

#define token(x)        x

#endif
00067 
00068 // class Lex
00069 
// Keywords registered at run time through RecordKeyword().  The table is
// created on first use by RecordKeyword() or Screening().
HashTable* Lex::user_keywords = nil;
// Comments appended by ReadComment() until GetComments() hands them over.
Ptree* Lex::comments = nil;
00072 
00073 Lex::Lex(Program* prog) : fifo(this)
00074 {
00075     file = prog;
00076     prog->Rewind();
00077     last_token = '\n';
00078     tokenp = 0;
00079     token_len = 0;
00080 
00081     InitializeOtherKeywords();
00082 }
00083 
00084 char* Lex::Save()
00085 {
00086     char* pos;
00087     int len;
00088 
00089     fifo.Peek(0, pos, len);
00090     return pos;
00091 }
00092 
00093 void Lex::Restore(char* pos)
00094 {
00095     last_token = '\n';
00096     tokenp = 0;
00097     token_len = 0;
00098     fifo.Clear();
00099     Rewind(pos);
00100 }
00101 
00102 // ">>" is either the shift operator or double closing brackets.
00103 
00104 void Lex::GetOnlyClosingBracket(Token& t)
00105 {
00106     Restore(t.ptr + 1);
00107 }
00108 
00109 uint Lex::LineNumber(char* pos, char*& ptr, int& len)
00110 {
00111     return file->LineNumber(pos, ptr, len);
00112 }
00113 
00114 int Lex::GetToken(Token& t)
00115 {
00116     t.kind = fifo.Pop(t.ptr, t.len);
00117     return t.kind;
00118 }
00119 
00120 int Lex::LookAhead(int offset)
00121 {
00122     return fifo.Peek(offset);
00123 }
00124 
00125 int Lex::LookAhead(int offset, Token& t)
00126 {
00127     t.kind = fifo.Peek(offset, t.ptr, t.len);
00128     return t.kind;
00129 }
00130 
00131 char* Lex::TokenPosition()
00132 {
00133     return (char*)file->Read(Tokenp());
00134 }
00135 
00136 char Lex::Ref(uint i)
00137 {
00138     return file->Ref(i);
00139 }
00140 
00141 void Lex::Rewind(char* p)
00142 {
00143     file->Rewind(p - file->Read(0));
00144 }
00145 
00146 bool Lex::RecordKeyword(char* keyword, int token)
00147 {
00148     int index;
00149     char* str;
00150 
00151     if(keyword == nil)
00152         return FALSE;
00153 
00154     str = new(GC) char[strlen(keyword) + 1];
00155     strcpy(str, keyword);
00156 
00157     if(user_keywords == nil)
00158         user_keywords = new HashTable;
00159 
00160     if(user_keywords->AddEntry(str, (HashValue)token, &index) >= 0)
00161         return TRUE;
00162     else
00163         return bool(user_keywords->Peek(index) == (HashValue)token);
00164 }
00165 
00166 bool Lex::Reify(Ptree* t, unsigned int& value)
00167 {
00168     if(t == nil || !t->IsLeaf())
00169         return FALSE;
00170 
00171     char* p = t->GetPosition();
00172     int len = t->GetLength();
00173     value = 0;
00174     if(len > 2 && *p == '0' && is_xletter(p[1])){
00175         for(int i = 2; i < len; ++i){
00176             char c = p[i];
00177             if(is_digit(c))
00178                 value = value * 0x10 + (c - '0');
00179             else if('A' <= c && c <= 'F')
00180                 value = value * 0x10 + (c - 'A' + 10);
00181             else if('a' <= c && c <= 'f')
00182                 value = value * 0x10 + (c - 'a' + 10);
00183             else if(is_int_suffix(c))
00184                 break;
00185             else
00186                 return FALSE;
00187         }
00188 
00189         return TRUE;
00190     }
00191     else if(len > 0 && is_digit(*p)){
00192         for(int i = 0; i < len; ++i){
00193             char c = p[i];
00194             if(is_digit(c))
00195                 value = value * 10 + c - '0';
00196             else if(is_int_suffix(c))
00197                 break;
00198             else
00199                 return FALSE;
00200         }
00201 
00202         return TRUE;
00203     }
00204     else
00205         return FALSE;
00206 }
00207 
00208 // Reify() doesn't interpret an escape character.
00209 
00210 bool Lex::Reify(Ptree* t, char*& str)
00211 {
00212     if(t == nil || !t->IsLeaf())
00213         return FALSE;
00214 
00215     char* p = t->GetPosition();
00216     int length = t->GetLength();
00217     if(*p != '"')
00218         return FALSE;
00219     else{
00220         str = new(GC) char[length];
00221         char* sp = str;
00222         for(int i = 1; i < length; ++i)
00223             if(p[i] != '"'){
00224                 *sp++ = p[i];
00225                 if(p[i] == '\\' && i + 1 < length)
00226                     *sp++ = p[++i];
00227             }
00228             else
00229                 while(++i < length && p[i] != '"')
00230                     ;
00231 
00232         *sp = '\0';
00233         return TRUE;
00234     }
00235 }
00236 
00237 // class TokenFifo
00238 
00239 Lex::TokenFifo::TokenFifo(Lex* l)
00240 {
00241     lex = l;
00242     size = 16;
00243     ring = new (GC) Slot[size];
00244     head = tail = 0;
00245 }
00246 
00247 Lex::TokenFifo::~TokenFifo()
00248 {
00249     // delete [] ring;
00250 }
00251 
00252 void Lex::TokenFifo::Clear()
00253 {
00254     head = tail = 0;
00255 }
00256 
00257 void Lex::TokenFifo::Push(int token, char* pos, int len)
00258 {
00259     const int Plus = 16;
00260     ring[head].token = token;
00261     ring[head].pos = pos;
00262     ring[head].len = len;
00263     head = (head + 1) % size;
00264     if(head == tail){
00265         Slot* ring2 = new (GC) Slot[size + Plus];
00266         int i = 0;
00267         do{
00268             ring2[i++] = ring[tail];
00269             tail = (tail + 1) % size;
00270         } while(head != tail);
00271         head = i;
00272         tail = 0;
00273         size += Plus;
00274         // delete [] ring;
00275         ring = ring2;
00276     }
00277 }
00278 
00279 int Lex::TokenFifo::Pop(char*& pos, int& len)
00280 {
00281     if(head == tail)
00282         return lex->ReadToken(pos, len);
00283 
00284     int t = ring[tail].token;
00285     pos = ring[tail].pos;
00286     len = ring[tail].len;
00287     tail = (tail + 1) % size;
00288     return t;
00289 }
00290 
00291 int Lex::TokenFifo::Peek(int offset)
00292 {
00293     return ring[Peek2(offset)].token;
00294 }
00295 
00296 int Lex::TokenFifo::Peek(int offset, char*& pos, int& len)
00297 {
00298     int cur = Peek2(offset);
00299     pos = ring[cur].pos;
00300     len = ring[cur].len;
00301     return ring[cur].token;
00302 }
00303 
00304 int Lex::TokenFifo::Peek2(int offset)
00305 {
00306     int i;
00307     int cur = tail;
00308 
00309     for(i = 0; i <= offset; ++i){
00310         if(head == cur){
00311             while(i++ <= offset){
00312                 char* p;
00313                 int   l;
00314                 int t = lex->ReadToken(p, l);
00315                 Push(t, p, l);
00316             }
00317 
00318             break;
00319         }
00320 
00321         cur = (cur + 1) % size;
00322     }
00323 
00324     return (tail + offset) % size;
00325 }
00326 
00327 /*
00328   Lexical Analyzer
00329 */
00330 
00331 int Lex::ReadToken(char*& ptr, int& len)
00332 {
00333     int t;
00334 
00335     for(;;){
00336         t = ReadLine();
00337 
00338         if(t == Ignore)
00339             continue;
00340 
00341         last_token = t;
00342 
00343 #if defined(__GNUG__) || defined(_GNUG_SYNTAX)
00344         if(t == ATTRIBUTE){
00345             SkipAttributeToken();
00346             continue;
00347         }
00348         else if(t == EXTENSION){
00349             t = SkipExtensionToken(ptr, len);
00350             if(t == Ignore)
00351                 continue;
00352             else
00353                 return t;
00354         }
00355 #endif
00356 #if defined(_MSC_VER)
00357         if(t == ASM){
00358             SkipAsmToken();
00359             continue;
00360         }
00361         else if(t == DECLSPEC){
00362             SkipDeclspecToken();
00363             continue;
00364         }
00365 #endif
00366         if(t != '\n')
00367             break;
00368     }
00369 
00370     ptr = TokenPosition();
00371     len = TokenLen();
00372     return t;
00373 }
00374 
//   SkipAttributeToken() skips __attribute__(...), __asm__(...), ...
00376 
00377 void Lex::SkipAttributeToken()
00378 {
00379     char c;
00380 
00381     do{
00382         c = file->Get();
00383     }while(c != '(' && c != '\0');
00384 
00385     int i = 1;
00386     do{
00387         c = file->Get();
00388         if(c == '(')
00389             ++i;
00390         else if(c == ')')
00391             --i;
00392         else if(c == '\0')
00393             break;
00394     } while(i > 0);
00395 }
00396 
00397 // SkipExtensionToken() skips __extension__(...).
00398 
00399 int Lex::SkipExtensionToken(char*& ptr, int& len)
00400 {
00401     ptr = TokenPosition();
00402     len = TokenLen();
00403 
00404     char c;
00405 
00406     do{
00407         c = file->Get();
00408     }while(is_blank(c) || c == '\n');
00409 
00410     if(c != '('){
00411         file->Unget();
00412         return Ignore;          // if no (..) follows, ignore __extension__
00413     }
00414 
00415     int i = 1;
00416     do{
00417         c = file->Get();
00418         if(c == '(')
00419             ++i;
00420         else if(c == ')')
00421             --i;
00422         else if(c == '\0')
00423             break;
00424     } while(i > 0);
00425 
00426     return Identifier;  // regards it as the identifier __extension__
00427 }
00428 
00429 #if defined(_MSC_VER)
00430 
00431 #define CHECK_END_OF_INSTRUCTION(C, EOI) \
00432         if (C == '\0') return; \
00433         if (strchr(EOI, C)) { \
00434             this->file->Unget(); \
00435             return; \
00436         }
00437 
00438 /* SkipAsmToken() skips __asm ...
00439    You can have the following :
00440 
00441    Just count the '{' and '}' and it should be ok
00442    __asm { mov ax,1
00443            mov bx,1 }
00444 
00445    Stop when EOL found. Note that the first ';' after
00446    an __asm instruction is an ASM comment !
00447    int v; __asm mov ax,1 __asm mov bx,1; v=1;
00448 
00449    Stop when '}' found
00450    if (cond) {__asm mov ax,1 __asm mov bx,1}
00451 
00452    and certainly more...
00453 */
00454 void Lex::SkipAsmToken()
00455 {
00456     char c;
00457 
00458     do{
00459         c = file->Get();
00460         CHECK_END_OF_INSTRUCTION(c, "");
00461     }while(is_blank(c) || c == '\n');
00462 
00463     if(c == '{'){
00464         int i = 1;
00465         do{
00466             c = file->Get();
00467             CHECK_END_OF_INSTRUCTION(c, "");
00468             if(c == '{')
00469                 ++i;
00470             else if(c == '}')
00471                 --i;
00472         } while(i > 0);
00473     }
00474     else{
00475         for(;;){
00476             CHECK_END_OF_INSTRUCTION(c, "}\n");
00477             c = file->Get();
00478         }
00479     }
00480 }
00481 
00482 //   SkipDeclspecToken() skips __declspec(...).
00483 
00484 void Lex::SkipDeclspecToken()
00485 {
00486     char c;
00487 
00488     do{
00489         c = file->Get();
00490         CHECK_END_OF_INSTRUCTION(c, "");
00491     }while(is_blank(c));
00492 
00493     if (c == '(') {
00494         int i = 1;
00495         do{
00496             c = file->Get();
00497             CHECK_END_OF_INSTRUCTION(c, "};");
00498             if(c == '(')
00499                 ++i;
00500             else if(c == ')')
00501                 --i;
00502         }while(i > 0);
00503     }
00504 }
00505 
00506 #undef CHECK_END_OF_INSTRUCTION
00507 
00508 #endif /* _MSC_VER */
00509 
00510 char Lex::GetNextNonWhiteChar()
00511 {
00512     char c;
00513 
00514     for(;;){
00515         do{
00516             c = file->Get();
00517         }while(is_blank(c));
00518 
00519         if(c != '\\')
00520             break;
00521 
00522         c = file->Get();
00523         if(c != '\n' && c!= '\r') {
00524             file->Unget();
00525             break;
00526         }
00527     }
00528 
00529     return c;
00530 }
00531 
00532 int Lex::ReadLine()
00533 {
00534     char c;
00535     uint top;
00536 
00537     c = GetNextNonWhiteChar();
00538 
00539     tokenp = top = file->GetCurPos();
00540     if(c == '\0'){
00541         file->Unget();
00542         return '\0';
00543     }
00544     else if(c == '\n')
00545         return '\n';
00546     else if(c == '#' && last_token == '\n'){
00547         if(ReadLineDirective())
00548             return '\n';
00549         else{
00550             file->Rewind(top + 1);
00551             token_len = 1;
00552             return SingleCharOp(c);
00553         }
00554     }
00555     else if(c == '\'' || c == '"'){
00556         if(c == '\''){
00557             if(ReadCharConst(top))
00558                 return token(CharConst);
00559         }
00560         else{
00561             if(ReadStrConst(top))
00562                 return token(StringL);
00563         }
00564 
00565         file->Rewind(top + 1);
00566         token_len = 1;
00567         return SingleCharOp(c);
00568     }
00569     else if(is_digit(c))
00570         return ReadNumber(c, top);
00571     else if(c == '.'){
00572         c = file->Get();
00573         if(is_digit(c))
00574             return ReadFloat(top);
00575         else{
00576             file->Unget();
00577             return ReadSeparator('.', top);
00578         }
00579     }
00580     else if(is_letter(c))
00581         return ReadIdentifier(top);
00582     else
00583         return ReadSeparator(c, top);
00584 }
00585 
00586 bool Lex::ReadCharConst(uint top)
00587 {
00588     char c;
00589 
00590     for(;;){
00591         c = file->Get();
00592         if(c == '\\'){
00593             c = file->Get();
00594             if(c == '\0')
00595                 return FALSE;
00596         }
00597         else if(c == '\''){
00598             token_len = int(file->GetCurPos() - top + 1);
00599             return TRUE;
00600         }
00601         else if(c == '\n' || c == '\0')
00602             return FALSE;
00603     }
00604 }
00605 
00606 /*
00607   If text is a sequence of string constants like:
00608         "string1" "string2"
  then the string constants are dealt with as a single constant.
00610 */
00611 bool Lex::ReadStrConst(uint top)
00612 {
00613     char c;
00614 
00615     for(;;){
00616         c = file->Get();
00617         if(c == '\\'){
00618             c = file->Get();
00619             if(c == '\0')
00620                 return FALSE;
00621         }
00622         else if(c == '"'){
00623             uint pos = file->GetCurPos() + 1;
00624             int nline = 0;
00625             do{
00626                 c = file->Get();
00627                 if(c == '\n')
00628                     ++nline;
00629             } while(is_blank(c) || c == '\n');
00630 
00631             if(c == '"')
00632                 /* line_number += nline; */ ;
00633             else{
00634                 token_len = int(pos - top);
00635                 file->Rewind(pos);
00636                 return TRUE;
00637             }
00638         }
00639         else if(c == '\n' || c == '\0')
00640             return FALSE;
00641     }
00642 }
00643 
00644 int Lex::ReadNumber(char c, uint top)
00645 {
00646     char c2 = file->Get();
00647 
00648     if(c == '0' && is_xletter(c2)){
00649         do{
00650             c = file->Get();
00651         } while(is_hexdigit(c));
00652         while(is_int_suffix(c))
00653             c = file->Get();
00654 
00655         file->Unget();
00656         token_len = int(file->GetCurPos() - top + 1);
00657         return token(Constant);
00658     }
00659 
00660     while(is_digit(c2))
00661         c2 = file->Get();
00662 
00663     if(is_int_suffix(c2))
00664         do{
00665             c2 = file->Get();
00666         }while(is_int_suffix(c2));
00667     else if(c2 == '.')
00668         return ReadFloat(top);
00669     else if(is_eletter(c2)){
00670         file->Unget();
00671         return ReadFloat(top);
00672     }
00673 
00674     file->Unget();
00675     token_len = int(file->GetCurPos() - top + 1);
00676     return token(Constant);
00677 }
00678 
00679 int Lex::ReadFloat(uint top)
00680 {
00681     char c;
00682 
00683     do{
00684         c = file->Get();
00685     }while(is_digit(c));
00686     if(is_float_suffix(c))
00687         do{
00688             c = file->Get();
00689         }while(is_float_suffix(c));
00690     else if(is_eletter(c)){
00691         uint p = file->GetCurPos();
00692         c = file->Get();
00693         if(c == '+' || c == '-'){
00694              c = file->Get();
00695              if(!is_digit(c)){
00696                 file->Rewind(p);
00697                 token_len = int(p - top);
00698                 return token(Constant);
00699             }
00700         }
00701         else if(!is_digit(c)){
00702             file->Rewind(p);
00703             token_len = int(p - top);
00704             return token(Constant);
00705         }
00706 
00707         do{
00708             c = file->Get();
00709         }while(is_digit(c));
00710 
00711         while(is_float_suffix(c))
00712             c = file->Get();
00713     }
00714 
00715     file->Unget();
00716     token_len = int(file->GetCurPos() - top + 1);
00717     return token(Constant);
00718 }
00719 
00720 // ReadLineDirective() simply ignores a line beginning with '#'
00721 
00722 bool Lex::ReadLineDirective()
00723 {
00724     char c;
00725 
00726     do{
00727         c = file->Get();
00728     }while(c != '\n' && c != '\0');
00729     return TRUE;
00730 }
00731 
00732 int Lex::ReadIdentifier(uint top)
00733 {
00734     char c;
00735 
00736     do{
00737         c = file->Get();
00738     }while(is_letter(c) || is_digit(c));
00739 
00740     uint len = file->GetCurPos() - top;
00741     token_len = int(len);
00742     file->Unget();
00743 
00744     return Screening((char*)file->Read(top), int(len));
00745 }
00746 
00747 /*
00748   This table is a list of reserved key words.
00749   Note: alphabetical order!
00750 */
00751 static struct rw_table {
00752     char*       name;
00753     long        value;
00754 } table[] = {
00755 #if defined(__GNUG__) || defined(_GNUG_SYNTAX)
00756     { "__alignof__",    token(SIZEOF) },
00757     { "__asm__",        token(ATTRIBUTE) },
00758     { "__attribute__",  token(ATTRIBUTE) },
00759     { "__const",        token(CONST) },
00760     { "__extension__",  token(EXTENSION) },
00761     { "__inline__",     token(INLINE) },
00762     { "__restrict",     token(Ignore) },
00763     { "__signed",       token(SIGNED) },
00764     { "__signed__",     token(SIGNED) },
00765 #endif
00766     { "asm",            token(ATTRIBUTE) },
00767     { "auto",           token(AUTO) },
00768 #if !defined(_MSC_VER) || (_MSC_VER >= 1100)
00769     { "bool",           token(BOOLEAN) },
00770 #endif
00771     { "break",          token(BREAK) },
00772     { "case",           token(CASE) },
00773     { "catch",          token(CATCH) },
00774     { "char",           token(CHAR) },
00775     { "class",          token(CLASS) },
00776     { "const",          token(CONST) },
00777     { "continue",       token(CONTINUE) },
00778     { "default",        token(DEFAULT) },
00779     { "delete",         token(DELETE) },
00780     { "do",             token(DO) },
00781     { "double",         token(DOUBLE) },
00782     { "else",           token(ELSE) },
00783     { "enum",           token(ENUM) },
00784     { "extern",         token(EXTERN) },
00785     { "float",          token(FLOAT) },
00786     { "for",            token(FOR) },
00787     { "friend",         token(FRIEND) },
00788     { "goto",           token(GOTO) },
00789     { "if",             token(IF) },
00790     { "inline",         token(INLINE) },
00791     { "int",            token(INT) },
00792     { "long",           token(LONG) },
00793     { "metaclass",      token(METACLASS) },     // OpenC++
00794     { "mutable",        token(MUTABLE) },
00795     { "namespace",      token(NAMESPACE) },
00796     { "new",            token(NEW) },
00797     { "operator",       token(OPERATOR) },
00798     { "private",        token(PRIVATE) },
00799     { "protected",      token(PROTECTED) },
00800     { "public",         token(PUBLIC) },
00801     { "register",       token(REGISTER) },
00802     { "return",         token(RETURN) },
00803     { "short",          token(SHORT) },
00804     { "signed",         token(SIGNED) },
00805     { "sizeof",         token(SIZEOF) },
00806     { "static",         token(STATIC) },
00807     { "struct",         token(STRUCT) },
00808     { "switch",         token(SWITCH) },
00809     { "template",       token(TEMPLATE) },
00810     { "this",           token(THIS) },
00811     { "throw",          token(THROW) },
00812     { "try",            token(TRY) },
00813     { "typedef",        token(TYPEDEF) },
00814     { "typename",       token(CLASS) }, // it's not identical to class, but...
00815     { "union",          token(UNION) },
00816     { "unsigned",       token(UNSIGNED) },
00817     { "using",          token(USING) },
00818     { "virtual",        token(VIRTUAL) },
00819     { "void",           token(VOID) },
00820     { "volatile",       token(VOLATILE) },
00821     { "while",          token(WHILE) },
00822     /* NULL slot */
00823 };
00824 
00825 static void InitializeOtherKeywords()
00826 {
00827     static BOOL done = FALSE;
00828 
00829     if(done)
00830         return;
00831     else
00832         done = TRUE;
00833 
00834     if(regularCpp)
00835         for(unsigned int i = 0; i < sizeof(table) / sizeof(table[0]); ++i)
00836             if(table[i].value == METACLASS){
00837                 table[i].value = Identifier;
00838                 break;
00839             }
00840 
00841 #if defined(_MSC_VER)
00842     assert(Lex::RecordKeyword("cdecl", Ignore));
00843     assert(Lex::RecordKeyword("_cdecl", Ignore));
00844     assert(Lex::RecordKeyword("__cdecl", Ignore));
00845 
00846     assert(Lex::RecordKeyword("_fastcall", Ignore));
00847     assert(Lex::RecordKeyword("__fastcall", Ignore));
00848     
00849     assert(Lex::RecordKeyword("_based", Ignore));
00850     assert(Lex::RecordKeyword("__based", Ignore));
00851 
00852     assert(Lex::RecordKeyword("_asm", ASM));
00853     assert(Lex::RecordKeyword("__asm", ASM));
00854 
00855     assert(Lex::RecordKeyword("_inline", INLINE));
00856     assert(Lex::RecordKeyword("__inline", INLINE));
00857 
00858     assert(Lex::RecordKeyword("_stdcall", Ignore));
00859     assert(Lex::RecordKeyword("__stdcall", Ignore));
00860 
00861     assert(Lex::RecordKeyword("__declspec", DECLSPEC));
00862 
00863     assert(Lex::RecordKeyword("__int8",  CHAR));
00864     assert(Lex::RecordKeyword("__int16", SHORT));
00865     assert(Lex::RecordKeyword("__int32", INT));
00866     assert(Lex::RecordKeyword("__int64",  INT64));
00867 #endif
00868 }
00869 
00870 int Lex::Screening(char *identifier, int len)
00871 {
00872     struct rw_table     *low, *high, *mid;
00873     int                 c, token;
00874 
00875     low = table;
00876     high = &table[sizeof(table) / sizeof(table[0]) - 1];
00877     while(low <= high){
00878         mid = low + (high - low) / 2;
00879         if((c = strncmp(mid->name, identifier, len)) == 0)
00880             if(mid->name[len] == '\0')
00881                 return mid->value;
00882             else
00883                 high = mid - 1;
00884         else if(c < 0)
00885             low = mid + 1;
00886         else
00887             high = mid - 1;
00888     }
00889 
00890     if(user_keywords == nil)
00891         user_keywords = new HashTable;
00892 
00893     if(user_keywords->Lookup(identifier, len, (HashValue*)&token))
00894         return token;
00895 
00896     return token(Identifier);
00897 }
00898 
00899 int Lex::ReadSeparator(char c, uint top)
00900 {
00901     char c1 = file->Get();
00902 
00903     token_len = 2;
00904     if(c1 == '='){
00905         switch(c){
00906         case '*' :
00907         case '/' :
00908         case '%' :
00909         case '+' :
00910         case '-' :
00911         case '&' :
00912         case '^' :
00913         case '|' :
00914             return token(AssignOp);
00915         case '=' :
00916         case '!' :
00917             return token(EqualOp);
00918         case '<' :
00919         case '>' :
00920             return token(RelOp);
00921         default :
00922             file->Unget();
00923             token_len = 1;
00924             return SingleCharOp(c);
00925         }
00926     }
00927     else if(c == c1){
00928         switch(c){
00929         case '<' :
00930         case '>' :
00931             if(file->Get() != '='){
00932                 file->Unget();
00933                 return token(ShiftOp);
00934             }
00935             else{
00936                 token_len = 3;
00937                 return token(AssignOp);
00938             }
00939         case '|' :
00940             return token(LogOrOp);
00941         case '&' :
00942             return token(LogAndOp);
00943         case '+' :
00944         case '-' :
00945             return token(IncOp);
00946         case ':' :
00947             return token(Scope);
00948         case '.' :
00949             if(file->Get() == '.'){
00950                 token_len = 3;
00951                 return token(Ellipsis);
00952             }
00953             else
00954                 file->Unget();
00955         case '/' :
00956             return ReadComment(c1, top);
00957         default :
00958             file->Unget();
00959             token_len = 1;
00960             return SingleCharOp(c);
00961         }
00962     }
00963     else if(c == '.' && c1 == '*')
00964         return token(PmOp);
00965     else if(c == '-' && c1 == '>')
00966         if(file->Get() == '*'){
00967             token_len = 3;
00968             return token(PmOp);
00969         }
00970         else{
00971             file->Unget();
00972             return token(ArrowOp);
00973         }
00974     else if(c == '/' && c1 == '*')
00975         return ReadComment(c1, top);
00976     else{
00977         file->Unget();
00978         token_len = 1;
00979         return SingleCharOp(c);
00980     }
00981 
00982     cerr << "*** An invalid character has been found! ("
00983          << (int)c << ',' << (int)c1 << ")\n";
00984     return token(BadToken);
00985 }
00986 
00987 int Lex::SingleCharOp(unsigned char c)
00988 {
00989                         /* !"#$%&'()*+,-./0123456789:;<=>? */
00990     static char valid[] = "x   xx xxxxxxxx          xxxxxx";
00991 
00992     if('!' <= c && c <= '?' && valid[c - '!'] == 'x')
00993         return c;
00994     else if(c == '[' || c == ']' || c == '^')
00995         return c;
00996     else if('{' <= c && c <= '~')
00997         return c;
00998     else
00999         return token(BadToken);
01000 }
01001 
01002 int Lex::ReadComment(char c, uint top) {
01003     uint len = 0;
01004     if (c == '*')       // a nested C-style comment is prohibited.
01005         do {
01006             c = file->Get();
01007             if (c == '*') {
01008                 c = file->Get();
01009                 if (c == '/') {
01010                     len = 1;
01011                     break;
01012                 }
01013                 else
01014                     file->Unget();
01015             }
01016         }while(c != '\0');
01017     else /* if (c == '/') */
01018         do {
01019             c = file->Get();
01020         }while(c != '\n' && c != '\0');
01021 
01022     len += file->GetCurPos() - top;
01023     token_len = int(len);
01024     Leaf* node = new Leaf((char*)file->Read(top), int(len));
01025     comments = Ptree::Snoc(comments, node);
01026     return Ignore;
01027 }
01028 
01029 Ptree* Lex::GetComments() {
01030     Ptree* c = comments;
01031     comments = nil;
01032     return c;
01033 }
01034 
01035 Ptree* Lex::GetComments2() {
01036     return comments;
01037 }
01038 
#ifdef TEST
#include <stdio.h>

// Stand-alone test driver: lexes standard input and prints one line per
// token, "<kind> (<hex>): " followed by the token text in double quotes.
// Tokens are fetched with ever-growing look-ahead, which exercises the
// token queue as well.
//
// NOTE(review): token kinds of 128 and above are printed as C strings
// obtained by casting the int kind back to a pointer.  That only works
// where a pointer fits in an int -- confirm before relying on this driver
// on a 64-bit target.
int main()
{
    int   i = 0;
    Token token;

    Lex lex(new ProgramFromStdin);
    for(;;){
//      int t = lex.GetToken(token);
        int t = lex.LookAhead(i++, token);
        if(t == 0)
            break;
        else if(t < 128)
            printf("%c (%x): ", t, t);
        else
            printf("%-10.10s (%x): ", (char*)t, t);

        putchar('"');
        while(token.len-- > 0)
            putchar(*token.ptr++);

        puts("\"");
    }

    return 0;
}
#endif
01066 
01067 /*
01068 
01069 line directive:
01070 ^"#"{blank}*{digit}+({blank}+.*)?\n
01071 
01072 pragma directive:
01073 ^"#"{blank}*"pragma".*\n
01074 
01075 Constant        {digit}+{int_suffix}*
01076                 "0"{xletter}{hexdigit}+{int_suffix}*
01077                 {digit}*\.{digit}+{float_suffix}*
01078                 {digit}+\.{float_suffix}*
01079                 {digit}*\.{digit}+"e"("+"|"-")*{digit}+{float_suffix}*
01080                 {digit}+\."e"("+"|"-")*{digit}+{float_suffix}*
01081                 {digit}+"e"("+"|"-")*{digit}+{float_suffix}*
01082 
01083 CharConst       \'([^'\n]|\\[^\n])\'
01084 
01085 StringL         \"([^"\n]|\\["\n])*\"
01086 
01087 Identifier      {letter}+({letter}|{digit})*
01088 
AssignOp        *= /= %= += -= &= ^= |= <<= >>=
01090 
01091 EqualOp         == !=
01092 
01093 RelOp           <= >=
01094 
01095 ShiftOp         << >>
01096 
01097 LogOrOp         ||
01098 
01099 LogAndOp        &&
01100 
01101 IncOp           ++ --
01102 
01103 Scope           ::
01104 
01105 Ellipsis        ...
01106 
01107 PmOp            .* ->*
01108 
01109 ArrowOp         ->
01110 
01111 others          !%^&*()-+={}|~[];:<>?,./
01112 
01113 BadToken        others
01114 
01115 */