SE250:lab-9:dols008 2

From Marks Wiki
Jump to navigation Jump to search

My actual done-during-the-lab report is over here. This report is about the tokeniser that Braedon and I worked on throughout the rest of the day. I think this exposed a couple of bugs in my parser (not sure, bad memory), so I might update the code over there at some point.

To begin with, it looked like we had to tokenise things on the fly, each time the parser called current. We figured out later that that's not actually the case, but it works fine, so we left it as is. This task was even more confusing and lengthy than option 2. We still haven't finished it, and we've been doing it in a simplistic (and poorly performing) way. Strings are not supported yet, there is some redundant code, and when combined with my parser it sometimes segfaults on bad input. There are probably other bugs too, but here's the code for now (I reserve the right to update it later). Requires the arraylist from a previous lab.

tokenise.h

#ifndef TOKENISE_H
#define TOKENISE_H

/* Minimal boolean type; the two constants are pinned to 0 and 1. */
typedef enum {
	false = 0,
	true = 1
} bool;

/* Kind tag carried by every Token. */
typedef enum {
	T_SYMBOL  = 0,	/* predefined symbol or keyword */
	T_IDENT   = 1,	/* identifier */
	T_INTEGER = 2,	/* integer literal */
	T_FLOAT   = 3,	/* floating-point literal */
	T_STRING  = 4,	/* string literal */
	T_END     = 5,	/* not assigned by the tokeniser source shown with this header */
	T_NOTHING = 6	/* not assigned by the tokeniser source shown with this header */
} token_t;

/*
 * A single token: a kind tag plus a payload whose active member depends
 * on the tag.
 */
typedef struct {
	token_t type;		/* selects which member of `val` is meaningful */
	union {
		int symval;	/* T_SYMBOL / T_IDENT: index of the entry in the symbol table */
		char* strval;	/* presumably for T_STRING; never set by the tokeniser yet - TODO confirm */
		int intval;	/* T_INTEGER: parsed value */
		double fltval;	/* T_FLOAT: parsed value */
	} val;
} Token;
/*
 * Scanning state over a NUL-terminated input string.
 */
typedef struct {
	Token current;	/* cached token at `pos`; TOK_NOTHING means "not scanned yet" */
	char* pos;	/* read position inside the input text */
} TokenStream;

/* The tokeniser interface */

/* Point `tokens` at the start of `input`; nothing is scanned yet. */
void init_TokenStream( TokenStream* tokens, char* input );

/* Compare two tokens for equality. */
bool eqToken( Token a, Token b );

/* Print a human-readable form of `tok` on standard output. */
void print_token( Token tok );

/* Return the token at the current position without consuming it. */
Token current( TokenStream* tokens );

/* Move past the current token. */
void advance( TokenStream* tokens );

/* Consume the current token if it equals `expected`, otherwise report an error. */
void expect( TokenStream* tokens, Token expected );

/* Print a printf-style message followed by a newline. */
void error( char* fmt, ... );

/* True when `tok` is an identifier. */
bool isVariable( Token tok );

/* True when `tok` is an integer, float or string literal. */
bool isConstant( Token tok );

/* True when current() yields TOK_NOTHING for `tokens`. */
bool end_of_tokens( TokenStream* tokens );

/* Predefined tokens; they are given values by init_predefined_tokens(). */

/* Operators */
extern Token TOK_ADD, TOK_SUB, TOK_MUL, TOK_DIV;
extern Token TOK_ASSIGN, TOK_EQ, TOK_NOT;

/* Punctuation */
extern Token TOK_OPENBRACE, TOK_CLOSEBRACE;
extern Token TOK_POPEN, TOK_PCLOSE;
extern Token TOK_SEMICOLON;

/* Keywords */
extern Token TOK_DO, TOK_ELSE, TOK_IF, TOK_PRINT;
extern Token TOK_SKIP, TOK_THEN, TOK_WHILE;

/* Special markers */
extern Token TOK_ERROR, TOK_NOTHING;

/*
 * Fill the symbol table with the predefined symbols and give the TOK_*
 * variables their values. Takes no arguments; declared with (void) so the
 * compiler rejects calls that pass any.
 */
void init_predefined_tokens( void );

#endif

tokenise.c


#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <assert.h>
#include <stdio.h>
#include <ctype.h>
#include "tokenise.h"
#include "arraylist.h"

/*
  ============
  Token stream
  ============
 */

/*
 * Table of Symbol entries (a name plus its token). It holds the predefined
 * symbols registered by init_predefined_tokens() and every identifier added
 * by tryIdentifier(). A T_SYMBOL or T_IDENT token stores its index into this
 * table in val.symval.
 */
ArrayList symbolTable;

/*
 * Prepare `tokens` for reading from `input`.
 * The cached token is reset to TOK_NOTHING, so the first call to current()
 * performs the first scan.
 */
void init_TokenStream( TokenStream* tokens, char* input ) {
  tokens->current = TOK_NOTHING;
  tokens->pos = input;
}

/*
 * Return true when `a` and `b` are the same kind of token and carry the
 * same payload.
 *
 * Symbols and identifiers are compared by symbol-table index, numbers by
 * value and strings by content. Kinds that carry no payload are equal
 * whenever their types match.
 */
bool eqToken( Token a, Token b ) {
	if (a.type != b.type)
		return false;

	switch (a.type) {
		case T_SYMBOL:
		case T_IDENT:
			return a.val.symval == b.val.symval;
		case T_INTEGER:
			return a.val.intval == b.val.intval;
		case T_FLOAT:
			return a.val.fltval == b.val.fltval;
		case T_STRING:
			/* A missing string only equals another missing string. */
			if (a.val.strval == NULL || b.val.strval == NULL)
				return a.val.strval == b.val.strval;
			return strcmp(a.val.strval, b.val.strval) == 0;
		case T_END:
		case T_NOTHING:
			return true;
	}
	return false;
}

/*
 * Return true when `c` is an ASCII letter or decimal digit.
 *
 * The previous hand-written range test compared against 'z' instead of 'Z'
 * for upper-case letters, so the punctuation between 'Z' and 'a'
 * ('[', '\\', ']', '^', '_', '`') was accepted as well.
 */
bool alphaNum(char c) {
	/* <ctype.h> functions require a value representable as unsigned char. */
	return isalnum((unsigned char)c) != 0;
}

/*
 * Try to recognise a symbol-table entry at the current position.
 *
 * Every entry whose name is a prefix of the remaining input is a candidate
 * and the longest candidate wins (so "==" beats "="). The table also
 * contains identifiers seen earlier, so this returns T_IDENT tokens for
 * known identifiers too.
 *
 * A word-like entry (one whose name ends in a letter or digit) is only
 * accepted when the input does not continue with another letter or digit.
 * Without that check "done" would be split into "do" + "ne", and a known
 * identifier would match inside any longer word that starts with it.
 *
 * Returns TOK_NOTHING when no entry matches.
 */
Token trySymbol( TokenStream* tokens ) {
  Token best = TOK_NOTHING;
  size_t bestLen = 0;
  bool found = false;
  int i;

  for (i = 0; i < arraylist_size(&symbolTable); ++i) {
    Symbol sym = arraylist_get(&symbolTable, i);
    size_t len = strlen(sym.name);

    if (strncmp(sym.name, tokens->pos, len) != 0)
      continue;

    /* pos[len] is safe to read: the first len characters matched a
       NUL-free name, so the input is at least len characters long. */
    if (len > 0
        && isalnum((unsigned char)sym.name[len - 1])
        && isalnum((unsigned char)tokens->pos[len]))
      continue;

    if (!found || len > bestLen) {
      found = true;
      best = sym.tok;
      bestLen = len;
    }
  }
  return best;
}

/*
 * Try to recognise an unsigned numeric literal at the current position.
 *
 * A run of decimal digits followed by '.' yields a T_FLOAT token; a run of
 * digits followed by anything else yields a T_INTEGER token. Returns
 * TOK_NOTHING when the input does not start with a digit. Negative numbers
 * are not handled here.
 */
Token tryNumber( TokenStream* tokens ) {
	const char* text = tokens->pos;
	const char* scan = text;
	Token number;

	while (*scan >= '0' && *scan <= '9')
		++scan;

	if (scan == text)
		return TOK_NOTHING;

	if (*scan == '.') {
		number.type = T_FLOAT;
		number.val.fltval = atof(text);
	} else {
		number.type = T_INTEGER;
		number.val.intval = atoi(text);
	}
	return number;
}

/*
 * Try to recognise an identifier (a letter followed by letters/digits as
 * judged by alphaNum()) at the current position.
 *
 * If the name is already in the symbol table the token refers to that
 * entry; otherwise a heap-allocated copy of the name is appended to the
 * table, which then owns it. Returns a T_IDENT token whose val.symval is
 * the table index, or TOK_NOTHING when the input does not start with a
 * letter or the name cannot be allocated.
 */
Token tryIdentifier( TokenStream* tokens ) {
	const char* start = tokens->pos;
	char first = *start;
	size_t len;
	int i;
	Token tok;
	Symbol sym;
	char* name;

	if (!((first >= 'a' && first <= 'z') || (first >= 'A' && first <= 'Z')))
		return TOK_NOTHING;

	for (len = 1; alphaNum(start[len]); ++len)
		;

	tok.type = T_IDENT;

	/* Look the name up in place, so a hit needs no allocation. Names in the
	   table are unique (an identifier is only added when no entry has the
	   same name), so the first match is the only one. */
	for (i = 0; i < arraylist_size(&symbolTable); ++i) {
		const char* known = arraylist_get(&symbolTable, i).name;
		if (strlen(known) == len && memcmp(known, start, len) == 0) {
			tok.val.symval = i;
			return tok;
		}
	}

	name = malloc(len + 1);
	if (name == NULL) {
		error("Out of memory while reading identifier");
		return TOK_NOTHING;
	}
	memcpy(name, start, len);
	name[len] = '\0'; /* Null terminate identifier. */

	tok.val.symval = arraylist_size(&symbolTable);
	sym.name = name;
	sym.tok = tok;
	arraylist_push(&symbolTable, sym);
	return tok;
}

/*
 * Return the token at the current position without consuming it.
 *
 * The result is cached in tokens->current; a cached value other than
 * TOK_NOTHING is returned directly. Otherwise spaces, tabs and newlines are
 * skipped (moving tokens->pos) and the recognisers are tried in order:
 * symbol, number, identifier. If none of them matches, the cached
 * TOK_NOTHING is returned.
 */
Token current( TokenStream* tokens ) {
  Token (*const recognisers[])( TokenStream* ) = { trySymbol, tryNumber, tryIdentifier };
  const int recogniserCount = (int)(sizeof recognisers / sizeof recognisers[0]);
  int k;

  if (eqToken(tokens->current, TOK_NOTHING)) {
    for (;;) {
      char c = *(tokens->pos);
      if (c != ' ' && c != '\t' && c != '\n')
        break;
      tokens->pos++;
    }

    /* Later recognisers run only when the earlier ones found nothing. */
    for (k = 0; k < recogniserCount; ++k) {
      Token candidate = recognisers[k](tokens);
      if (!eqToken(candidate, TOK_NOTHING)) {
        tokens->current = candidate;
        return candidate;
      }
    }
  }

  return tokens->current;
}

/*
 * Consume the current token: move tokens->pos past its text and clear the
 * cached token so the next current() call scans again.
 *
 * current() leaves tokens->pos on the first character of the token, so
 * numeric tokens are skipped by walking over their characters in the input.
 * When nothing was recognised, a single character is skipped, but never the
 * terminating NUL, so the position cannot run off the end of the input.
 */
void advance( TokenStream* tokens ) {
  Token tok = current( tokens );
  if (eqToken(tok, TOK_NOTHING)) {
    if (*(tokens->pos) != '\0')
      tokens->pos++;
  } else if (tok.type == T_SYMBOL || tok.type == T_IDENT) {
    char* name = arraylist_get(&symbolTable, tok.val.symval).name;
    tokens->pos += strlen(name);
  } else if (tok.type == T_INTEGER) {
    /* Skip the digits actually present in the input. Re-printing the value
       gives the wrong length for leading zeros or out-of-range literals. */
    while (*(tokens->pos) >= '0' && *(tokens->pos) <= '9')
      ++(tokens->pos);
  } else if (tok.type == T_FLOAT) {
    while ((*(tokens->pos) >= '0' && *(tokens->pos) <= '9') || *(tokens->pos) == '.')
      ++(tokens->pos);
  } else
    error("advance: cannot skip token of type %d", (int)tok.type);

  tokens->current = TOK_NOTHING;
}

/* Return true when scanning at the current position yields TOK_NOTHING. */
bool end_of_tokens( TokenStream* tokens ) {
  Token upcoming = current( tokens );
  return eqToken( upcoming, TOK_NOTHING );
}

/* Return true when `tok` is an identifier token. */
bool isVariable( Token tok ) {
  if (tok.type == T_IDENT)
    return true;
  return false;
}

/* Return true when `tok` is a literal: integer, float or string. */
bool isConstant( Token tok ) {
  switch (tok.type) {
    case T_INTEGER:
    case T_FLOAT:
    case T_STRING:
      return true;
    default:
      return false;
  }
}


/*
 * Print `tok` on standard output as "<text>:<kind> ".
 * Symbols and identifiers print their symbol-table name, numbers print
 * their value; any other kind prints only its numeric type code.
 */
void print_token( Token tok ) {
  switch (tok.type) {
    case T_SYMBOL:
      printf("%s:sym ", arraylist_get(&symbolTable, tok.val.symval).name);
      break;
    case T_IDENT:
      printf("%s:ident ", arraylist_get(&symbolTable, tok.val.symval).name);
      break;
    case T_INTEGER:
      printf("%d:int ", tok.val.intval);
      break;
    case T_FLOAT:
      printf("%f:float ", tok.val.fltval);
      break;
    default:
      printf( "%d", tok.type );
      break;
  }
}


/*
 * Render `tok` as text for diagnostics.
 * Symbols and identifiers use their symbol-table name; every other kind is
 * formatted into `scratch` (capacity `size`). The returned pointer is
 * either the table's name or `scratch`.
 */
static const char* describe_token( Token tok, char* scratch, size_t size ) {
  switch (tok.type) {
    case T_SYMBOL:
    case T_IDENT:
      return arraylist_get(&symbolTable, tok.val.symval).name;
    case T_INTEGER:
      snprintf( scratch, size, "%d", tok.val.intval );
      return scratch;
    case T_FLOAT:
      snprintf( scratch, size, "%f", tok.val.fltval );
      return scratch;
    default:
      snprintf( scratch, size, "token type %d", (int)tok.type );
      return scratch;
  }
}

/*
 * Consume the current token if it equals `expected`; otherwise report what
 * was expected, what was found, and up to 40 characters of the input at the
 * current position.
 *
 * The message is built from strings: passing Token or TokenStream structs
 * straight to %c / %s conversions is undefined behaviour.
 */
void expect( TokenStream* tokens, Token expected ) {
  if( eqToken( current( tokens ), expected ) )
    advance( tokens );
  else {
    char wanted[32];
    char got[32];
    error( "Expected '%s', but found '%s', near \"%.40s\"",
           describe_token( expected, wanted, sizeof wanted ),
           describe_token( current( tokens ), got, sizeof got ),
           tokens->pos );
  }
}

/* Storage for the predefined tokens; init_predefined_tokens() fills them in. */

/* Operators */
Token TOK_ADD, TOK_SUB, TOK_MUL, TOK_DIV;
Token TOK_ASSIGN, TOK_EQ, TOK_NOT;

/* Punctuation */
Token TOK_OPENBRACE, TOK_CLOSEBRACE;
Token TOK_POPEN, TOK_PCLOSE;
Token TOK_SEMICOLON;

/* Keywords */
Token TOK_DO, TOK_ELSE, TOK_IF, TOK_PRINT;
Token TOK_SKIP, TOK_THEN, TOK_WHILE;

/* Special markers */
Token TOK_ERROR, TOK_NOTHING;

/*
 * Register the predefined symbol spelled `str` and store its token in `var`.
 *
 * The token becomes a T_SYMBOL whose val.symval is the index the new entry
 * gets in symbolTable. The macro uses a variable named `symbol` from the
 * calling scope as scratch space for the table entry.
 *
 * Wrapped in do { ... } while (0) so the expansion is a single statement
 * and stays correct inside an unbraced if/else.
 */
#define DEFINE_SYMBOL(var, str) do { \
	(var).type = T_SYMBOL; \
	(var).val.symval = arraylist_size(&symbolTable); \
	symbol.name = (str); \
	symbol.tok = (var); \
	arraylist_push(&symbolTable, symbol); \
} while (0)

void init_predefined_tokens( ) {
	Symbol symbol;
	arraylist_init(&symbolTable);
	
	DEFINE_SYMBOL(TOK_ADD, "+");
	DEFINE_SYMBOL(TOK_ASSIGN, "=");
	DEFINE_SYMBOL(TOK_CLOSEBRACE, "}");
	DEFINE_SYMBOL(TOK_DIV, "/");
	DEFINE_SYMBOL(TOK_DO, "do");
	DEFINE_SYMBOL(TOK_ELSE, "else");
	DEFINE_SYMBOL(TOK_EQ, "==");
	DEFINE_SYMBOL(TOK_ERROR, "ERROR");
	DEFINE_SYMBOL(TOK_IF, "if");
	DEFINE_SYMBOL(TOK_MUL, "*");
	DEFINE_SYMBOL(TOK_NOT, "!");
	DEFINE_SYMBOL(TOK_OPENBRACE, "{");
	DEFINE_SYMBOL(TOK_PCLOSE, ")");
	DEFINE_SYMBOL(TOK_POPEN, "(");
	DEFINE_SYMBOL(TOK_PRINT, "print");
	DEFINE_SYMBOL(TOK_SEMICOLON, ";");
	DEFINE_SYMBOL(TOK_SKIP, "skip");
	DEFINE_SYMBOL(TOK_SUB, "-");
	DEFINE_SYMBOL(TOK_THEN, "then");
	DEFINE_SYMBOL(TOK_WHILE, "while");
	DEFINE_SYMBOL(TOK_NOTHING, "nothing");
}

/*
 * Print a printf-style formatted message on standard output, followed by a
 * newline. Execution continues after the message is printed.
 */
void error( char* fmt, ... ) {
  va_list ap;

  va_start( ap, fmt );
  vfprintf( stdout, fmt, ap );
  va_end( ap );

  putchar( '\n' );
}


/*
 * Tokenise `str` and print each token on its own line until the stream
 * reports end of tokens.
 */
void test_tokeniser( char* str ) {
  TokenStream stream;

  init_TokenStream( &stream, str );
  while( ! end_of_tokens( &stream ) ) {
    print_token( current( &stream ) );
    printf( "\n" );
    advance( &stream );
  }
}

Some example input to try with the tokeniser-parser pair:

lol = 8; fred = lol + 27
print 1.2 + 2
if a + b then haha = 27 else { fred = a; thats = b }