/* Scan.c */

/*********************************************************************
 *  NAME:          Benjamin Fowler
 *  NUMBER:        02251132
 *  SUBJECT:       Modern Compiler Construction
 *  INSTRUCTOR:    Dr Wayne Kelly
 *********************************************************************
 *  MODULE:        Scan.c
 *  PURPOSE:       Implements the Scan module.
 *  DATE STARTED:  5th March, 2000.
 *  LAST EDITED:   10th May, 2000.
 *
 *  REVISION HISTORY:
 *
 *   5th March:    Wrote declarations, some work on implementation.
 *   6th March:    Wrote scanner, fixed some bugs.
 *   8th March:    Added refinement: minimal perfect hashing for
 *                  parsing of reserved words.  This should make
 *                  the efficiency of scanning reserved words
 *                  more acceptable.
 *                 Enable by #defining FAST_RESERVED_WORDS
 *   23rd March:   Moved declarations/definitions of printToken()
 *                  to Util.[ch]
 *   1st April:    Fixed a scanner bug in Assignment 1 relating to 
 *                  comment parsing.
 *   10th May:     Fixed a bug where the EchoSource option wasn't
 *                  working.
 *********************************************************************/


#include "Globals.h"
#include "Scan.h"
#include "Util.h"

/* Uncomment this to enable fast reserved word scanning */

/* #define FAST_RESERVED_WORDS */


/* Definitions relevant to the operation of the scanner */

/* Capacity, in bytes, of the line buffer lineText */
#define BUFFERLENGTH 256

/*
 * Scanner state used by the character-level routines and by getToken()
 *  further down this file.
 */
char lineText[BUFFERLENGTH];      /* text of the current source line */
char tokenString[MAXTOKENLEN+1];  /* lexeme of the current token (NUL-terminated by getToken) */
int  lineIndex = 0;               /* index into lineText of the next character to hand out */
int  lineSize = 0;                /* length of the current line, in chars; 0 until a line is read */

/*
 * Here are the various states that the lexer DFSA (driven by getToken())
 *  can be in:
 *
 *   START       between tokens; dispatches on the first character
 *   INNUM       inside a run of digits
 *   INID        inside a run of letters (identifier or reserved word)
 *   INDIV       seen "/": either a divide symbol or the start of a comment
 *   INCOMMENT   inside a comment body
 *   INCOMMENT2  inside a comment, immediately after a "*"
 *   INNE        seen "!": expecting "="
 *   INLT        seen "<": may be followed by "="
 *   INGT        seen ">": may be followed by "="
 *   INEQ        seen "=": may be followed by "="
 *   ERRORSTATE  declared here, but getToken() has no case for it
 *   DONE        a complete token has been recognised
 */

typedef enum
{
    START, INNUM, INID, INDIV, INCOMMENT, INCOMMENT2, INNE, INLT, INGT,
	INEQ, ERRORSTATE, DONE
} LexerState;


/*
 * Here are the language reserved words, together with their corresponding
 *  tokens.
 *
 * The two tables are parallel: reservedWordsTokens[i] is the token for
 *  the spelling reservedWords[i].  LookupReservedWord() searches them
 *  linearly when FAST_RESERVED_WORDS is not defined.
 *
 * Each row holds 7 chars: the longest word, "return", is 6 characters
 *  plus the terminating NUL.
 *
 * NOTE(review): MAXRESERVED is defined outside this file; the
 *  initialisers below supply 6 entries - confirm the two agree.
 */

char reservedWords[MAXRESERVED][7] =
  { "int", "void", "if", "else", "return", "while" };

TokenType reservedWordsTokens[MAXRESERVED] = 
  { INT, VOID, IF, ELSE, RETURN, WHILE };

/***********************************************************************
  The following code is automatically generated by the gperf(1) 
   minimal perfect hash function generator tool.
 ***********************************************************************/

/* C code produced by gperf version 2.7 */
/* Command-line: /tmp/n2251132/gperf-2.7/src/gperf -t gperfinput.txt  */

/*
 * One entry of the perfect-hash keyword table: the keyword's spelling
 *  and the token that LookupReservedWord() returns for it.
 */
struct fastReservedWords { char *name; TokenType tok; };

/* Parameters of the generated hash; they describe the six keywords above */
#define TOTAL_KEYWORDS 6     /* number of keywords in the table */
#define MIN_WORD_LENGTH 2    /* length of the shortest keyword ("if") */
#define MAX_WORD_LENGTH 6    /* length of the longest keyword ("return") */
#define MIN_HASH_VALUE 2     /* smallest hash of any keyword ("if") */
#define MAX_HASH_VALUE 9     /* largest hash of any keyword ("void"); last wordlist index */
/* maximum key range = 8, duplicates = 0 */

/*
 * NAME:     hash()
 * PURPOSE:  Perfect hash function (originally generated by gperf) for
 *            the reserved words.
 *
 *  str  - the candidate word; only its first and last characters are read
 *  len  - the length of str, in chars
 *
 *  Returns len plus the association values of the first and last
 *   characters.  For the six reserved words this yields the distinct
 *   values 2..9 (if=2, int=3, else=4, while=5, return=6, void=9).
 *   Any character that does not begin or end a reserved word
 *   contributes 10, which pushes the result past the end of the
 *   keyword table.
 *
 *  An empty word (len == 0) hashes to 0, which is below the smallest
 *   keyword hash, instead of reading str[len - 1] out of bounds.
 *
 *  Written with a prototype rather than the K&R-style definition gperf
 *   2.7 emits, so that the compiler checks and converts the arguments.
 */
#ifdef __GNUC__
__inline
#endif
static unsigned int
hash (const char *str, unsigned int len)
{
  /* Indexed by character code; 256 entries cover every unsigned char */
  static const unsigned char asso_values[] =
    {
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
      5, 0, 0,10,10, 0,10,10,10,10,
      0,10,10,10, 0,10, 0,10, 0, 0,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10,10,10,10,10,
     10,10,10,10,10,10
    };

  if (len == 0)
    return 0;

  /* The casts keep the index non-negative where plain char is signed */
  return len + asso_values[(unsigned char)str[len - 1]] + asso_values[(unsigned char)str[0]];
}

/*
 * NAME:     in_word_set()
 * PURPOSE:  Perfect-hash lookup of a reserved word (originally generated
 *            by gperf).
 *
 *  str  - NUL-terminated candidate word
 *  len  - the length of str, in chars
 *
 *  Returns a pointer to the matching keyword table entry, or NULL when
 *   str is not a reserved word.
 *
 *  Defined with a prototype so that callers' arguments are converted to
 *   the parameter types.  Deliberately not declared inline: under C99
 *   and later inline rules an inline definition with external linkage
 *   does not itself provide the definition that callers link against,
 *   and it may not define a modifiable static object such as wordlist.
 */
struct fastReservedWords *
in_word_set (const char *str, unsigned int len)
{
  /* Indexed by hash value; the {"", 0} rows are hash values no keyword has */
  static struct fastReservedWords wordlist[] =
    {
      {"", 0}, {"", 0},
      {"if", IF},
      {"int", INT},
      {"else", ELSE},
      {"while", WHILE},
      {"return", RETURN},
      {"", 0}, {"", 0},
      {"void", VOID}
    };

  if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
    {
      unsigned int key = hash (str, len);

      if (key <= MAX_HASH_VALUE)
        {
          const char *s = wordlist[key].name;

          /*
           * Compare from the first character: starting at str + 1 and
           *  s + 1 would step past the end of an empty placeholder name.
           */
          if (strcmp (str, s) == 0)
            return &wordlist[key];
        }
    }
  return NULL;
}

/**** END OF AUTO-GENERATED CODE ***************************************/


/*
 * NAME:     LookupReservedWord()
 * PURPOSE:  Looks up identifiers to check to see if they're reserved words.
 *
 *  lexeme - NUL-terminated spelling of the identifier to classify
 *
 *  If the routine finds a match, the corresponding token is returned, else
 *   the token "ID" is returned instead.
 *
 *  With FAST_RESERVED_WORDS defined the lookup goes through the perfect
 *   hash (in_word_set()); otherwise the reservedWords table is searched
 *   linearly.
 */


TokenType LookupReservedWord(char *lexeme)
{   
#ifdef FAST_RESERVED_WORDS

    struct fastReservedWords *rWord;

    /*
     * in_word_set() takes the length as an unsigned int, whereas strlen()
     *  yields a size_t.  Convert explicitly rather than relying on a
     *  prototype being in scope to do it for us.
     */
    rWord = in_word_set(lexeme, (unsigned int) strlen(lexeme));

    return rWord ? rWord->tok : ID;

#else

    int i;

    for (i = 0; i < MAXRESERVED; ++i)
    {
        if (strcmp(lexeme, reservedWords[i]) == 0)
            return reservedWordsTokens[i];
    }

    return ID;
#endif
}


/*
 * NAME:    getNextChar()
 * PURPOSE: Returns the next character from the source file.
 *
 *  The reader will note that this code, and the "lookahead" mechanism
 *   bears a remarkable resemblance to that of Louden's in "Compiler
 *   Construction: Principles and Practice".
 *
 *  Louden's code is as good as it gets, so I'd be wasting my time
 *   reinventing the wheel.  Therefore, some snippets of code are
 *   borrowed from Louden.
 *
 *  Characters are handed out from lineText; when the line is used up
 *   the next one is read from "source", lineno is advanced, and (if
 *   EchoSource is set) the line is echoed to "listing".
 *
 *  The result is an int, not a char: each character is returned as an
 *   unsigned char value so that it can never compare equal to EOF, and
 *   EOF itself survives on platforms where plain char is unsigned.
 */

/* Non-zero once fgets() has failed to deliver another line */
static int sourceExhausted = 0;

static int getNextChar(void)
{
    /* Have we run out of characters? */
    if (lineIndex >= lineSize)
    {
        /* get a new line and return a character */
        if (!fgets(lineText, sizeof lineText, source))
        {
            /*
             * End of input is reported on the line after the last one.
             *  Count that line once only, however many times the
             *  caller asks again.
             */
            if (!sourceExhausted)
            {
                ++lineno;
                sourceExhausted = 1;
            }

            return EOF;
        }

        sourceExhausted = 0;
        ++lineno;

        lineSize = (int) strlen(lineText);
        lineIndex = 0;

        /*
         * If EchoSource is TRUE, we need to display source lines to
         *  standard output.
         */
        if (EchoSource)
            fprintf(listing, "SOURCE: %5d: %s", lineno, lineText);
    }

    return (unsigned char) lineText[lineIndex++];
}


/*
 * NAME:    ungetNextChar()
 * PURPOSE: Backs up one character, so that the character most recently
 *           returned by getNextChar() is returned again.
 *
 *  Does nothing when the most recent result was EOF: no character was
 *   consumed in that case, and stepping back would re-deliver the last
 *   character of the file.
 */

static void ungetNextChar(void)
{
    if (!sourceExhausted && lineIndex > 0)
        --lineIndex;
}


/*
 * NAME:     getToken()
 * PURPOSE:  Returns the next token in the source file.
 *
 *  This stuff here is mostly my code.  It tokenises valid C- source
 *   files (rather than Tiny C, like Louden's) and sports a minor
 *   refinement, in that is uses an automatically-generated minimal
 *   perfect hash function to determine if identifiers are reserved
 *   words.
 */

TokenType getToken(void)
{
    int        tokenIndex = 0;   /* index into tokenString    */
    TokenType  currentToken;     /* token to be returned      */
    LexerState state = START;    /* FSA state                 */
    int        save;             /* save char to tokenString? */

    int        c;             /* character under examination */
    

    while (state != DONE)
    {
	c = getNextChar();
	save = TRUE;      /* save this character into lexeme by default */

	switch(state)
	{
	case START:
	    if (isdigit(c))
		state = INNUM;
	    else if (isalpha(c))
		state = INID;
	    else if (c == '/')
		state = INDIV;
	    else if (c == '!')
		state = INNE;
	    else if (c == '<')
		state = INLT;
	    else if (c == '>')
		state = INGT;
	    else if (c == '=')
		state = INEQ;
	    else if ((c == '\n') || (c == '\t') || (c == ' '))
		save = FALSE;
	    else
	    {
		/*
		 * We can only now have one-character symbols. The state
		 *  machine has to return a symbol, or ERROR.
		 */
		state = DONE;   /* It's almost time for us to bail out */
		switch (c)
		{
		case EOF:
		    save = FALSE;
		    currentToken = ENDOFFILE;
		    break;
		case '+':
		    currentToken = PLUS;
		    break;
		case '-':
		    currentToken = MINUS;
		    break;
		case '*':
		    currentToken = TIMES;
		    break;
		case ';':
		    currentToken = SEMI;
		    break;
		case ',':
		    currentToken = COMMA;
		    break;
		case '[':
		    currentToken = LSQUARE;
		    break;
		case ']':
		    currentToken = RSQUARE;
		    break;
		case '(':
		    currentToken = LPAREN;
		    break;
		case ')':
		    currentToken = RPAREN;
		    break;
		case '{':
		    currentToken = LBRACE;
		    break;
		case '}':
		    currentToken = RBRACE;
		    break;
		default:
		    currentToken = ERROR;
		    break;
		}  /* switch(c) */
	    } /* if..then..else */

	    break;

	case INNE:
	    state = DONE;
	    
	    if (c == '=')
		currentToken = NE;
	    else
	    {
		/* back up in the input */
		ungetNextChar();
		save = FALSE;
		currentToken = ERROR;
	    }
	    break;
	    
	case INLT:
	    state = DONE;

	    if (c == '=')
		currentToken = LTE;
	    else
	    {
		/*
		 * Ok, we haven't scanned a <= symbol: put the next char
		 *  back for the next token
		 */
		ungetNextChar();
		save = FALSE;
		currentToken = LT;
	    }
	    break;

	case INGT:
	    state = DONE;
	    
	    if (c == '=')
		currentToken = GTE;
	    else
	    {
		/*
		 * Ok, we haven't scanned a >= symbol: put the next char
		 *  back for the next token
		 */
		ungetNextChar();
		save = FALSE;
		currentToken = GT;
	    }
	    break;
	    
	case INEQ:
	    state = DONE;
	    
	    if (c == '=')
		currentToken = EQ;
	    else
	    {
		/*
		 * Ok, we haven't scanned a <= symbol: put the next char
		 *  back for the next token
		 */
		ungetNextChar();
		save = FALSE;
		currentToken = ASSIGN;
	    }
	    break;

	case INDIV:
	    if (c == '*')
	    {
		save = FALSE;
		state = INCOMMENT;
		tokenIndex -=1;     /* remove "/" from tokenString */
	    }
	    else
	    {
		ungetNextChar();
		save = FALSE;
		state = DONE;
		currentToken = DIVIDE;
	    }
	    break;

	case INCOMMENT:
	    save = FALSE;
	    
	    if (c == '*')
		state = INCOMMENT2;

	    /*
	     * This next bit of code fixes a bug where the scanner goes into
	     *  an infinite loop if an EOF is encountered inside a comment
	     */
	    
	    else if (c == EOF)
		state = ERROR;
	    
	    break;

	case INCOMMENT2:
	    save = FALSE;

	    if (c == '/')
		state = START;
            /*
             * BUG FIX: if we're in state INCOMMENT2 and we encounter an
             *   asterisk, we want to remain in INCOMMENT2 - this becomes
             *   important when 2 or more astericks are immediately followed
             *   by a slash....
             */
            else if (c == '*')
                state = INCOMMENT2;  /* no change */
	    else
		state = INCOMMENT;
	    break;

	case INID:
	    if (!isalpha(c))
	    {
		/* back up */
		ungetNextChar();
		save = FALSE;
		state = DONE;
		currentToken = ID;
	    }

	    break;

	case INNUM:
	    if (!isdigit(c))
	    {
		/* back up */
		ungetNextChar();
		save = FALSE;
		state = DONE;
		currentToken = NUM;
	    }
	    break;

	case DONE:
	default:
	    fprintf(listing, "<<<SCANNER BUG>>: state = %d\n", state);
	    state = DONE;
	    currentToken = ERROR;
	    break;
	}  /* switch(state) */

	
	/* Append a character onto the tokenString (if it was asked for) */
	if ((save) && (tokenIndex <= MAXTOKENLEN))
	    tokenString[tokenIndex++] = c;
	
	if (state == DONE)
	{
	    /* null-terminate the string */
	    tokenString[tokenIndex] = '\0';
	    
            if (currentToken == ID)
                currentToken = LookupReservedWord(tokenString);
	}
	
    }  /* while (state != DONE) */
	
    /*
     * If we've enabled the TraceScan option, output a detailed trace
     *  of the lexical scanner's actions.
     */
    
    if (TraceScan)
    {
	fprintf(listing, "SCAN: %5d: ", lineno);
	printToken(currentToken, tokenString);
        fprintf(listing, "\n");
    }
    
    return currentToken;
}


/* END OF FILE */