package java_cup;
|
|
import java.util.Hashtable;
|
|
import java_cup.runtime.str_token;
|
import java_cup.runtime.token;
|
|
/** This class implements a small scanner (aka lexical analyzer or lexer) for
|
* the JavaCup specification. This scanner reads characters from standard
|
* input (System.in) and returns token objects carrying the terminal
|
* number of the next token. Once end of input is reached the EOF token is
|
* returned on every subsequent call.<p>
|
* Tokens currently returned include: <pre>
|
* Symbol Constant Returned Symbol Constant Returned
|
* ------ ----------------- ------ -----------------
|
* "package" PACKAGE "import" IMPORT
|
* "code" CODE "action" ACTION
|
* "parser" PARSER "terminal" TERMINAL
|
* "non" NON "init" INIT
|
* "scan" SCAN "with" WITH
|
* "start" START ; SEMI
|
* , COMMA * STAR
|
* . DOT : COLON
|
* ::= COLON_COLON_EQUALS | BAR
|
* identifier ID {:...:} CODE_STRING
|
* "debug" DEBUG
|
* </pre>
|
* All symbol constants are defined in sym.java which is generated by
|
* JavaCup from parser.cup.<p>
|
*
|
* In addition to the scanner proper (called first via init() then with
|
* next_token() to get each token) this class provides simple error and
|
* warning routines and keeps a count of errors and warnings that is
|
* publicly accessible.<p>
|
*
|
* This class is "static" (i.e., it has only static members and methods).
|
*
|
* @version last updated: 11/25/95
|
* @author Scott Hudson
|
*/
|
public class lexer {
|
|
/*-----------------------------------------------------------*/
|
/*--- Constructor(s) ----------------------------------------*/
|
/*-----------------------------------------------------------*/
|
|
/** The only constructor is private, so no instances can be created. */
|
private lexer() { }
|
|
/*-----------------------------------------------------------*/
|
/*--- Static (Class) Variables ------------------------------*/
|
/*-----------------------------------------------------------*/
|
|
/** First character of lookahead. */
|
protected static int next_char;
|
|
/** Second character of lookahead. */
|
protected static int next_char2;
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** EOF constant. */
|
protected static final int EOF_CHAR = -1;
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Table of keywords. Keywords are initially treated as identifiers.
|
* Just before they are returned we look them up in this table to see if
|
* they match one of the keywords. The string of the name is the key here,
|
* which indexes Integer objects holding the symbol number.
|
*/
|
protected static Hashtable keywords = new Hashtable(23);
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Table of single character symbols. For ease of implementation, we
|
* store all unambiguous single character tokens in this table of Integer
|
* objects keyed by Integer objects with the numerical value of the
|
* appropriate char (currently Character objects have a bug which precludes
|
* their use in tables).
|
*/
|
protected static Hashtable char_symbols = new Hashtable(11);
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Current line number for use in error messages. */
|
protected static int current_line = 1;
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Character position in current line. */
|
protected static int current_position = 1;
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Count of total errors detected so far. */
|
public static int error_count = 0;
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Count of warnings issued so far */
|
public static int warning_count = 0;
|
|
/*-----------------------------------------------------------*/
|
/*--- Static Methods ----------------------------------------*/
|
/*-----------------------------------------------------------*/
|
|
/** Initialize the scanner. This sets up the keywords and char_symbols
|
* tables and reads the first two characters of lookahead.
|
*/
|
public static void init() throws java.io.IOException
|
{
|
/* set up the keyword table */
|
keywords.put("package", new Integer(sym.PACKAGE));
|
keywords.put("import", new Integer(sym.IMPORT));
|
keywords.put("code", new Integer(sym.CODE));
|
keywords.put("action", new Integer(sym.ACTION));
|
keywords.put("parser", new Integer(sym.PARSER));
|
keywords.put("terminal", new Integer(sym.TERMINAL));
|
keywords.put("non", new Integer(sym.NON));
|
keywords.put("init", new Integer(sym.INIT));
|
keywords.put("scan", new Integer(sym.SCAN));
|
keywords.put("with", new Integer(sym.WITH));
|
keywords.put("start", new Integer(sym.START));
|
keywords.put("debug", new Integer(sym.DEBUG));
|
|
/* set up the table of single character symbols */
|
char_symbols.put(new Integer(';'), new Integer(sym.SEMI));
|
char_symbols.put(new Integer(','), new Integer(sym.COMMA));
|
char_symbols.put(new Integer('*'), new Integer(sym.STAR));
|
char_symbols.put(new Integer('.'), new Integer(sym.DOT));
|
char_symbols.put(new Integer('|'), new Integer(sym.BAR));
|
|
/* read two characters of lookahead */
|
next_char = System.in.read();
|
if (next_char == EOF_CHAR)
|
next_char2 = EOF_CHAR;
|
else
|
next_char2 = System.in.read();
|
}
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Advance the scanner one character in the input stream. This moves
|
* next_char2 to next_char and then reads a new next_char2.
|
*/
|
protected static void advance() throws java.io.IOException
|
{
|
int old_char;
|
|
old_char = next_char;
|
next_char = next_char2;
|
if (next_char == EOF_CHAR)
|
next_char2 = EOF_CHAR;
|
else
|
next_char2 = System.in.read();
|
|
/* count this */
|
current_position++;
|
if (old_char == '\n')
|
{
|
current_line++;
|
current_position = 1;
|
}
|
}
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Emit an error message. The message will be marked with both the
|
* current line number and the position in the line. Error messages
|
* are printed on standard error (System.err).
|
* @param message the message to print.
|
*/
|
public static void emit_error(String message)
|
{
|
System.err.println("Error at " + current_line + "(" + current_position +
|
"): " + message);
|
error_count++;
|
}
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Emit a warning message. The message will be marked with both the
|
* current line number and the position in the line. Messages are
|
* printed on standard error (System.err).
|
* @param message the message to print.
|
*/
|
public static void emit_warn(String message)
|
{
|
System.err.println("Warning at " + current_line + "(" + current_position +
|
"): " + message);
|
warning_count++;
|
}
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Determine if a character is ok to start an id.
|
* @param ch the character in question.
|
*/
|
protected static boolean id_start_char(int ch)
|
{
|
return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
|
(ch == '_');
|
|
// later need to deal with non-8-bit chars here
|
}
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Determine if a character is ok for the middle of an id.
|
* @param ch the character in question.
|
*/
|
protected static boolean id_char(int ch)
|
{
|
return id_start_char(ch) || (ch >= '0' && ch <= '9');
|
}
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Try to look up a single character symbol, returns -1 for not found.
|
* @param ch the character in question.
|
*/
|
protected static int find_single_char(int ch)
|
{
|
Integer result;
|
|
result = (Integer)char_symbols.get(new Integer((char)ch));
|
if (result == null)
|
return -1;
|
else
|
return result.intValue();
|
}
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Handle swallowing up a comment. Both old style C and new style C++
|
* comments are handled.
|
*/
|
protected static void swallow_comment() throws java.io.IOException
|
{
|
/* next_char == '/' at this point */
|
|
/* is it a traditional comment */
|
if (next_char2 == '*')
|
{
|
/* swallow the opener */
|
advance(); advance();
|
|
/* swallow the comment until end of comment or EOF */
|
for (;;)
|
{
|
/* if its EOF we have an error */
|
if (next_char == EOF_CHAR)
|
{
|
emit_error("Specification file ends inside a comment");
|
return;
|
}
|
|
/* if we can see the closer we are done */
|
if (next_char == '*' && next_char2 == '/')
|
{
|
advance();
|
advance();
|
return;
|
}
|
|
/* otherwise swallow char and move on */
|
advance();
|
}
|
}
|
|
/* is its a new style comment */
|
if (next_char2 == '/')
|
{
|
/* swallow the opener */
|
advance(); advance();
|
|
/* swallow to '\n', '\f', or EOF */
|
while (next_char != '\n' && next_char != '\f' && next_char!=EOF_CHAR)
|
advance();
|
|
return;
|
|
}
|
|
/* shouldn't get here, but... if we get here we have an error */
|
emit_error("Malformed comment in specification -- ignored");
|
advance();
|
}
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Swallow up a code string. Code strings begin with "{:" and include
|
all characters up to the first occurrence of ":}" (there is no way to
|
include ":}" inside a code string). The routine returns an str_token
|
object suitable for return by the scanner.
|
*/
|
protected static token do_code_string() throws java.io.IOException
|
{
|
StringBuffer result = new StringBuffer();
|
|
/* at this point we have lookahead of "{:" -- swallow that */
|
advance(); advance();
|
|
/* save chars until we see ":}" */
|
while (!(next_char == ':' && next_char2 == '}'))
|
{
|
/* if we have run off the end issue a message and break out of loop */
|
if (next_char == EOF_CHAR)
|
{
|
emit_error("Specification file ends inside a code string");
|
break;
|
}
|
|
/* otherwise record the char and move on */
|
result.append(new Character((char)next_char));
|
advance();
|
}
|
|
/* advance past the closer and build a return token */
|
advance(); advance();
|
return new str_token(sym.CODE_STRING, result.toString());
|
}
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Process an identifier. Identifiers begin with a letter, underscore,
|
* or dollar sign, which is followed by zero or more letters, numbers,
|
* underscores or dollar signs. This routine returns an str_token suitable
|
* for return by the scanner.
|
*/
|
protected static token do_id() throws java.io.IOException
|
{
|
StringBuffer result = new StringBuffer();
|
String result_str;
|
Integer keyword_num;
|
char buffer[] = new char[1];
|
|
/* next_char holds first character of id */
|
buffer[0] = (char)next_char;
|
result.append(buffer,0,1);
|
advance();
|
|
/* collect up characters while they fit in id */
|
while(id_char(next_char))
|
{
|
buffer[0] = (char)next_char;
|
result.append(buffer,0,1);
|
advance();
|
}
|
|
/* extract a string and try to look it up as a keyword */
|
result_str = result.toString();
|
keyword_num = (Integer)keywords.get(result_str);
|
|
/* if we found something, return that keyword */
|
if (keyword_num != null)
|
return new token(keyword_num.intValue());
|
|
/* otherwise build and return an id token with an attached string */
|
return new str_token(sym.ID, result_str);
|
}
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Return one token. This is the main external interface to the scanner.
|
* It consumes sufficient characters to determine the next input token
|
* and returns it. To help with debugging, this routine actually calls
|
* real_next_token() which does the work. If you need to debug the
|
* parser, this can be changed to call debug_next_token() which prints
|
* a debugging message before returning the token.
|
*/
|
public static token next_token() throws java.io.IOException
|
{
|
return real_next_token();
|
}
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** Debugging version of next_token(). This routine calls the real scanning
|
* routine, prints a message on System.out indicating what the token is,
|
* then returns it.
|
*/
|
public static token debug_next_token() throws java.io.IOException
|
{
|
token result = real_next_token();
|
System.out.println("# next_token() => " + result.sym);
|
return result;
|
}
|
|
/*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
|
|
/** The actual routine to return one token. This is normally called from
|
* next_token(), but for debugging purposes can be called indirectly from
|
* debug_next_token().
|
*/
|
protected static token real_next_token() throws java.io.IOException
|
{
|
int sym_num;
|
|
for (;;)
|
{
|
/* look for white space */
|
if (next_char == ' ' || next_char == '\t' || next_char == '\n' ||
|
next_char == '\f' || next_char == '\r')
|
{
|
/* advance past it and try the next character */
|
advance();
|
continue;
|
}
|
|
/* look for a single character symbol */
|
sym_num = find_single_char(next_char);
|
if (sym_num != -1)
|
{
|
/* found one -- advance past it and return a token for it */
|
advance();
|
return new token(sym_num);
|
}
|
|
/* look for : or ::= */
|
if (next_char == ':')
|
{
|
/* if we don't have a second ':' return COLON */
|
if (next_char2 != ':')
|
{
|
advance();
|
return new token(sym.COLON);
|
}
|
|
/* move forward and look for the '=' */
|
advance();
|
if (next_char2 == '=')
|
{
|
advance(); advance();
|
return new token(sym.COLON_COLON_EQUALS);
|
}
|
else
|
{
|
/* return just the colon (already consumed) */
|
return new token(sym.COLON);
|
}
|
}
|
|
/* look for a comment */
|
if (next_char == '/' && (next_char2 == '*' || next_char2 == '/'))
|
{
|
/* swallow then continue the scan */
|
swallow_comment();
|
continue;
|
}
|
|
/* look for start of code string */
|
if (next_char == '{' && next_char2 == ':')
|
return do_code_string();
|
|
/* look for an id or keyword */
|
if (id_start_char(next_char)) return do_id();
|
|
/* look for EOF */
|
if (next_char == EOF_CHAR) return new token(sym.EOF);
|
|
/* if we get here, we have an unrecognized character */
|
emit_warn("Unrecognized character '" +
|
new Character((char)next_char) + "'(" + next_char +
|
") -- ignored");
|
|
/* advance past it */
|
advance();
|
}
|
}
|
|
/*-----------------------------------------------------------*/
|
};
|