Thickduck
Thickduck

Reputation: 19

Trying to make a lexer in C and the output is not desirable

I have been trying to make a programming language. I did some research and found out that I need to build a lexer, after some more painful research, I came up with a lexer written in C. but when I run make, and run the output file, the output is good till the first line (refer below) but the next line is not desirable.

lexer.h

#ifndef LEXER_H
#define LEXER_H
#include "token.h"

typedef struct LEXER_STRUCT {
    char c;
    unsigned int i;
    char *contents;

} lexer_T;

// init method
lexer_T *init_lexer(char *contents);
void lexer_advance(lexer_T *lexer);
void lexer_skip_whitespace(lexer_T *lexer);
token_T *lexer_get_next_token(lexer_T *lexer);
token_T *lexer_collect_string(lexer_T *lexer);
token_T *lexer_advance_with_token(lexer_T *lexer, token_T *token);
token_T *lexer_collect_id(lexer_T *lexer);
char *lexer_get_current_char_as_string(lexer_T *lexer);

#endif

lexer.c

#include "include/lexer.h"
#include "include/token.h"
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdio.h>

lexer_T *init_lexer(char *contents)
{
    lexer_T *lexer = calloc(1, sizeof(struct LEXER_STRUCT));
    lexer->contents = contents;
    lexer->i = 0;
    lexer->c = contents[lexer->i];

    return lexer;
}

void lexer_advance(lexer_T *lexer)
{
    if (lexer->c != '\0' && lexer->i < strlen(lexer->contents))
    {
        lexer->i += 1;

        lexer->c = lexer->contents[lexer->i];
    }
}

void lexer_skip_whitespace(lexer_T *lexer)
{
    while (lexer->c == ' ' || lexer->c == 10)
    {
        lexer_advance(lexer);
    }
}

token_T *lexer_get_next_token(lexer_T *lexer)
{
    while (lexer->c != '\0' && lexer->i < strlen(lexer->contents))
    {
        if (lexer->c == ' ' || lexer->c == 10)
        {
            lexer_skip_whitespace(lexer);
        }

        if (isalnum(lexer->c))
            return lexer_collect_id(lexer);

        if (lexer->c == '"')
        {
            lexer_collect_string(lexer);
        }

        switch (lexer->c)
        {
        case ';':
            return lexer_advance_with_token(lexer, init_token(TOKEN_SEMI, lexer_get_current_char_as_string(lexer)));
            break;
        case '=':
            return lexer_advance_with_token(lexer, init_token(TOKEN_EQUALS, lexer_get_current_char_as_string(lexer)));
            break;
        case '(':
            return lexer_advance_with_token(lexer, init_token(TOKEN_LBRACK, lexer_get_current_char_as_string(lexer)));
            break;
        case ')':
            return lexer_advance_with_token(lexer, init_token(TOKEN_RBRACK, lexer_get_current_char_as_string(lexer)));
            break;
        }
    }
    return (void *)0;
}

token_T *lexer_advance_with_token(lexer_T *lexer, token_T *token)
{
    lexer_advance(lexer);
    return token;
}

token_T *lexer_collect_string(lexer_T *lexer)
{
    char *value = calloc(1, sizeof(char));

    while (lexer->c != '"')
    {
        char *s = lexer_get_current_char_as_string(lexer);
        value = realloc(value, (strlen(value) * strlen(s) + 1) * sizeof(char));
        strcat(value, s);
        lexer_advance(lexer);
    }

    lexer_advance(lexer);

    return init_token(TOKEN_STRING, value);
}

token_T *lexer_collect_id(lexer_T *lexer)
{
    char *value = calloc(1, sizeof(char));

    while (isalnum(lexer->c))
    {
        char *s = lexer_get_current_char_as_string(lexer);
        value = realloc(value, (strlen(value) * strlen(s) + 1) * sizeof(char));
        strcat(value, s);

        lexer_advance(lexer);
    }

    lexer_advance(lexer);

    return init_token(TOKEN_STRING, value);
}

char *lexer_get_current_char_as_string(lexer_T *lexer)
{
    char *str = calloc(2, sizeof(char));
    str[0] = lexer->c;
    str[1] = '\0';

    return str;
}

main.c (this is the file im compiling)

#include <stdio.h>
#include "include/lexer.h"

int main() {
    lexer_T *lexer = init_lexer(
        "var name = \"thickduckplayz\";\n"
        "print(name)"
    );
    token_T *token = (void*)0;

    while ((token = lexer_get_next_token(lexer)) != (void*)0)
    {
        printf("TOKEN(%d, %s)\n", token->type, token->value);
    }
    
    return 0;
}

The Bug

For each time, it seems to put some random characters, is there like a memory leak because one time I saw a directory. I can provide more info if you want and the github repo is here.

Upvotes: 0

Views: 811

Answers (3)

chqrlie
chqrlie

Reputation: 144770

There are multiple problems in your code:

  • in lexer_get_next_token(), you should return lexer_collect_string(lexer); when the current character is '"'.
  • in lexer_collect_string(), you should first advance to the next character to skip the initial ".
  • in lexer_collect_string(), you should test for the end of string. As coded you have an infinite loop if the string is unterminated.
  • in lexer_collect_id() you call lexer_advance() after the last identifier character, potentially skipping the next token.
  • you should use '\n' instead of 10 for readability
  • you compute strlen(lexer->contents) far too many times. You should just test if lexer->contents[lexer->i] != '\n' or store the length when you initialize the lexer.

Here is a modified version:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

//#include "include/token.h"
//#include "include/lexer.h"

enum {
    TOKEN_SEMI,
    TOKEN_COMMA,
    TOKEN_EQUALS,
    TOKEN_LBRACK,
    TOKEN_RBRACK,
    TOKEN_STRING,
    TOKEN_NUMBER,
    TOKEN_ID,
};

typedef struct TOKEN_STRUCT {
    int type;
    char *value;
} token_T;

token_T *init_token(int token, char *value) {
    token_T *tok = calloc(1, sizeof(sizeof *tok));
    tok->type = token;
    tok->value = value;
    return tok;
}

const char *token_type(int type) {
    switch (type) {
    case TOKEN_SEMI:    return "SEMI";
    case TOKEN_COMMA:   return "COMMA";
    case TOKEN_EQUALS:  return "EQUALS";
    case TOKEN_LBRACK:  return "LBRACK";
    case TOKEN_RBRACK:  return "RBRACK";
    case TOKEN_STRING:  return "STRING";
    case TOKEN_NUMBER:  return "NUMBER";
    case TOKEN_ID:      return "ID";
    default:            return "Unknown";
    }
}

typedef struct LEXER_STRUCT {
    unsigned char c;
    unsigned int i, len;
    const char *contents;
} lexer_T;

lexer_T *init_lexer(const char *contents);
void lexer_advance(lexer_T *lexer);
token_T *lexer_get_next_token(lexer_T *lexer);
token_T *lexer_collect_single_char(lexer_T *lexer, int token_type);
token_T *lexer_collect_string(lexer_T *lexer);
token_T *lexer_collect_number(lexer_T *lexer);
token_T *lexer_collect_id(lexer_T *lexer);

lexer_T *init_lexer(const char *contents) {
    lexer_T *lexer = calloc(1, sizeof(struct LEXER_STRUCT));
    lexer->contents = contents;
    lexer->len = strlen(contents);
    lexer->i = 0;
    lexer->c = contents[lexer->i];
    return lexer;
}

void lexer_advance(lexer_T *lexer) {
    if (lexer->i < lexer->len) {
        lexer->i += 1;
        lexer->c = lexer->contents[lexer->i];
    }
}

token_T *lexer_collect_single_char(lexer_T *lexer, int token_type) {
    char *str = calloc(2, sizeof(char));
    str[0] = lexer->c;
    str[1] = '\0';
    lexer_advance(lexer);
    return init_token(token_type, str);
}

token_T *lexer_collect_string(lexer_T *lexer) {
    char *value = calloc(1, sizeof(char));
    unsigned int len = 0;

    lexer_advance(lexer);
    while (lexer->c != '\0' && lexer->c != '"') {
        value = realloc(value, (len + 1) * sizeof(char));
        value[len++] = lexer->c;
        lexer_advance(lexer);
    }
    value[len] = '\0';

    if (lexer->c == '"')
        lexer_advance(lexer);

    return init_token(TOKEN_STRING, value);
}

token_T *lexer_collect_id(lexer_T *lexer) {
    char *value = calloc(1, sizeof(char));
    unsigned int len = 0;

    while (isalnum(lexer->c) || lexer->c == '_') {
        value = realloc(value, (len + 1) * sizeof(char));
        value[len++] = lexer->c;
        lexer_advance(lexer);
    }
    value[len] = '\0';
    return init_token(TOKEN_ID, value);
}

token_T *lexer_collect_number(lexer_T *lexer) {
    char *value = calloc(1, sizeof(char));
    unsigned int len = 0;

    while (isdigit(lexer->c) || lexer->c == '.') {
        value = realloc(value, (len + 1) * sizeof(char));
        value[len++] = lexer->c;
        lexer_advance(lexer);
    }
    value[len] = '\0';
    return init_token(TOKEN_NUMBER, value);
}

token_T *lexer_get_next_token(lexer_T *lexer) {
    while (lexer->c != 0) {
        if (lexer->c == ' ' || lexer->c == '\n') {
            lexer_advance(lexer);
            continue;
        }
        if (isalpha(lexer->c) || lexer->c == '_')
            return lexer_collect_id(lexer);

        if (isdigit(lexer->c))
            return lexer_collect_number(lexer);

        if (lexer->c == '"')
            return lexer_collect_string(lexer);

        switch (lexer->c) {
        case ';':  return lexer_collect_single_char(lexer, TOKEN_SEMI);
        case ',':  return lexer_collect_single_char(lexer, TOKEN_COMMA);
        case '=':  return lexer_collect_single_char(lexer, TOKEN_EQUALS);
        case '(':  return lexer_collect_single_char(lexer, TOKEN_LBRACK);
        case ')':  return lexer_collect_single_char(lexer, TOKEN_RBRACK);
        }
    }
    return (void *)0;
}

int main() {
    lexer_T *lexer = init_lexer("var name = \"thickduckplayz\";\n"
                                "print(name, 1)");
    token_T *token = (void*)0;

    while ((token = lexer_get_next_token(lexer)) != (void*)0) {
        printf("TOKEN(%s, '%s')\n", token_type(token->type), token->value);
    }

    return 0;
}

Upvotes: 0

wkz
wkz

Reputation: 2263

Judging by the wording of your question, it sounds like your goal is not to learn how to write a lexer, but to build an actual language. In that case you might find it easier to use a parser generator, which will do a lot of the heavy lifting for you.

If you are implementing your language in C, the classic tools are lex (which generates a lexer) and yacc (which generates a parser). Today, people mostly use flex in place of lex, and bison in place of yacc. Here is an example of a lexer and parser for a C-like language built using these tools:

https://github.com/wkz/ply/blob/master/src/libply/lexer.l

https://github.com/wkz/ply/blob/master/src/libply/grammar.y

(Full disclosure: I'm the author)

Outside of C, there are other interesting tools available, pest.rs for example is pretty easy to get going with if you are in to Rust.

Upvotes: 1

rdtsc
rdtsc

Reputation: 303

In token.c you don't ever return anything from init_token which miraculously isn't breaking things, but should still be fixed. Using calloc is also unnecessary when you initialize the value of each field so it can be replaced with malloc.

token_T *init_token(int type, char *value)
{
    token_T *token = malloc(sizeof(struct TOKEN_STRUCT));
    token->type = type;
    token->value = value;
    return token;
}

In lexer.c you also never return the result of lexer_collect_string in lexer_get_next_token which is why you are not getting the results you expect and instead whatever happens to be left in the RAX register being read as the token address. So in lexer_get_next_token you should change the third if statement to

if (lexer->c == '"')
{
    return lexer_collect_string(lexer);
}

I'd also advise changing gcc $(files) $(flags) -o $(exec) in your makefile to gcc -Wall $(files) $(flags) -o $(exec) to avoid things like this in the future.

Upvotes: 1

Related Questions