Assembler/src/lexer.c

/*
This code is part of the EIPA Platform

This code contains the implementations of all functions related to the lexical analysis of the EIPA Assembly file
*/

#include <stdio.h>
#include <string.h>

#include "../header/common.h"

void lexer(FILE *input_file, char tokens[][MAX_TOKEN_SIZE])
{
    // Stores the current character we are examining
    char current_char = 0;
    // Stores at which token current_char is
    unsigned int token_index = 0;
    // Stores wether current_char is the beginning of a line, so that we can skip any indentation at that point
    short unsigned int is_pos_line_start = 1;

    // Loop trough all characters in the file
    while ((current_char = fgetc(input_file)) != EOF)
    {
        if (is_pos_line_start)
        {
            // Loop trough Spaces and Tabs, to make empty lines and Indentation work
            is_pos_line_start = 0;
            while (current_char != EOF && (current_char == ASCII_SPACE || current_char == ASCII_TAB || current_char == ASCII_NEWLINE))
            {
                current_char = fgetc(input_file);
            }
        }

        switch (current_char)
        {
        case EOF:
            break;
        case ASCII_TAB:
        case ASCII_SPACE:
            // This is an indice of a new token begining, so we probably need to increase token_index

            // Loop to the characters until the next character fgetc() would read is not space or tab
            while (PREFGETC(input_file) == ASCII_SPACE || PREFGETC(input_file) == ASCII_TAB)
            {
                current_char = fgetc(input_file);
            }
            /*
            Between the Adress and the newline in an Instruction is usually no space.
            Therefore the token_index gets increased, whenever a newline is found.
            However, there can also be a space between the Adress and the newline.
            To not increase the token_index 2 times, we need to not increase it here if the next character is a \n
            */
            if (PREFGETC(input_file) != ';' && PREFGETC(input_file) != ASCII_NEWLINE)
            {
                token_index++;
            }
            break;
        case ASCII_NEWLINE:
            // This is a indice of a new token -> increase token_index
            token_index++;

            // This also is a indice of a new instruction beginning
            // in the tokens array, instructions are seperated by semicolons
            tokens[token_index][0] = ';';

            // Since the Instruction seperator (';') is also a token, we need to increase token_index again
            token_index++;

            is_pos_line_start = 1;
            break;
        case ';':
            // Loop over the comment
            while (PREFGETC(input_file) != ASCII_NEWLINE && PREFGETC(input_file) != '\0')
            {
                current_char = fgetc(input_file);
            }
            break;
        default:
            strncat(tokens[token_index], &current_char, 1);
            break;
        }
    }
    tokens[token_index + 1][0] = ';';
    tokens[token_index + 2][0] = EOF;
}

/*
Prints every token in tokens on its own line to stdout.

Parameters:
    tokens: token array. Printing stops at the first slot whose first character
            is (char)EOF, so such a slot has to be present.
*/
void print_tokens(char tokens[][MAX_TOKEN_SIZE])
{
    size_t token_index = 0;
    // The end marker is stored in a char, so it has to be compared as a char:
    // if char is unsigned, the stored value would never be equal to the (negative) int EOF
    while (tokens[token_index][0] != (char)EOF)
    {
        printf("%s\n", tokens[token_index]);
        token_index++;
    }
}