Assembler/src/lexer.c
2023-02-25 19:41:58 +01:00

95 lines
3.2 KiB
C

/*
This code is part of the EIPA Platform
This code contains the implementations of all functions related to the lexical analysis of the EIPA Assembly file
*/
#include <stdio.h>
#include <string.h>
#include "../header/common.h"
void lexer(FILE *input_file, char tokens[][MAX_TOKEN_SIZE])
{
// Stores the current character we are examining
char current_char = 0;
// Stores at which token current_char is
unsigned int token_index = 0;
// Stores wether current_char is the beginning of a line, so that we can skip any indentation at that point
short unsigned int is_pos_line_start = 1;
// Loop trough all characters in the file
while ((current_char = fgetc(input_file)) != EOF)
{
if (is_pos_line_start)
{
// Loop trough Spaces and Tabs, to make empty lines and Indentation work
is_pos_line_start = 0;
while (current_char != EOF && (current_char == ASCII_SPACE || current_char == ASCII_TAB || current_char == ASCII_NEWLINE))
{
current_char = fgetc(input_file);
}
}
switch (current_char)
{
case EOF:
break;
case ASCII_TAB:
case ASCII_SPACE:
// This is an indice of a new token begining, so we probably need to increase token_index
// Loop to the characters until the next character fgetc() would read is not space or tab
while (PREFGETC(input_file) == ASCII_SPACE || PREFGETC(input_file) == ASCII_TAB)
{
current_char = fgetc(input_file);
}
/*
Between the Adress and the newline in an Instruction is usually no space.
Therefore the token_index gets increased, whenever a newline is found.
However, there can also be a space between the Adress and the newline.
To not increase the token_index 2 times, we need to not increase it here if the next character is a \n
*/
if (PREFGETC(input_file) != ';' && PREFGETC(input_file) != ASCII_NEWLINE)
{
token_index++;
}
break;
case ASCII_NEWLINE:
// This is a indice of a new token -> increase token_index
token_index++;
// This also is a indice of a new instruction beginning
// in the tokens array, instructions are seperated by semicolons
tokens[token_index][0] = ';';
// Since the Instruction seperator (';') is also a token, we need to increase token_index again
token_index++;
is_pos_line_start = 1;
break;
case ';':
// Loop over the comment
while (PREFGETC(input_file) != ASCII_NEWLINE && PREFGETC(input_file) != '\0')
{
current_char = fgetc(input_file);
}
break;
default:
strncat(tokens[token_index], &current_char, 1);
break;
}
}
tokens[token_index + 1][0] = ';';
tokens[token_index + 2][0] = EOF;
}
/*
Prints every token in tokens to stdout, one token per line.
Printing stops at the first entry whose first character is the end marker,
which is EOF stored in a char (lexer() writes it after the last token).

tokens: the token array to print; it has to contain the end marker
*/
void print_tokens(char tokens[][MAX_TOKEN_SIZE])
{
    // The marker lives in a char, so it has to be compared with EOF converted to char.
    // Comparing with the int EOF directly never matches where plain char is unsigned
    for (size_t token_index = 0; tokens[token_index][0] != (char)EOF; token_index++)
    {
        printf("%s\n", tokens[token_index]);
    }
}