HossamMohammed

Reputation: 3

Tiny Language compiler using Python and regex

Hello Stack Overflow users, I hope you're having a good day. I'm writing a Tiny Language compiler for my homework and tried using regex, but the output is weird. First of all, I get an identifier called 't' which does not appear anywhere in my input, and the identifier 'x' is not separated from the semicolon that follows it (a minimal repro of both problems follows my code below). Thanks in advance for your help.

Here is my input

read x;   {input an integer }
     if  0 < x   then     {  don’t compute if x <= 0 }
        fact  := 1;
        repeat 
           fact  := fact *  x;
            x  := x  -  1 
        until  x  =  0;
        write  fact   {  output  factorial of x }
     end 

And here is my code using regex

# -*- coding: utf-8 -*-
"""
Created on Wed May 13 04:11:06 2020

@author: PC
"""

class OwnCompiler (object):
    def __init__ (self,file):
        import re
        self.file=open(file,"r").readlines()
        self.symbols = {
                "+":"PLUS_OP",
                "-":"MINUS_OP",
                "*":"MUL_OP",
                "/":"DIV_OP",
                "=":"EQUAL_OP",
                "<":"LESS_OP",
                ">":"GREATER_OP",
                "(":"LEFT_PARENTHESIS",
                ")":"RIGHT_PARENTHESIS",
                ":=":"ASSIGN",
                ";":"SEMICOLON",
                }
        self.commentPattern = re.compile(r".*({\n*\s*.*\s*})")
        self.reservePattern = re.compile(r"\s*(read|write|if|then|else|end|repeat|until)+\s*(.*)(then)*")
        self.symbolPattern = re.compile(r".*(\+|\*|-|/|=|<|>|\(|\)|;)")
        self.identifierSymbol = re.compile(r".*(\w+)\s+(:=)\s+(.*)")

    def compileOutput(self):
        self.fileWrite=open("output.txt","w")
        self.fileWrite.write("Type        Token\n==================\n")
        for i in self.file :
            print(i)
            self.getComment(i)
            self.getReserveWord(i)
            self.getIdentify(i)
        self.fileWrite.close()#end
    def getComment(self,text):
        try:
            self.fileWrite.write("COMMENT        "+self.commentPattern.match(text).group(1)+"\n")
        except:
            print("NO_COMMENT")
    def getReserveWord(self,text):
        self.Compiled = self.reservePattern.match(text)
        try:
            self.fileWrite.write("RESERVE_WORD        "+self.Compiled.group(1)+"\n")
            self.getSymbols(self.Compiled.group(2))
            try:
                self.fileWrite.write("RESERVE_WORD        "+self.Compiled.group(3)+"\n")
            except:
                print("NO_RESERVE_WORD2")
        except:
            print("NO_RESERVE_WORD")
    def getSymbols(self,text):
        self.Compiled= self.symbolPattern.match(text)
        self.GOT_TOKEN= self.getTokensSymbols(self.Compiled.group())
        try:
            self.fileWrite.write(self.GOT_TOKEN+"        "+self.Compiled.group()+"\n")
        except:
            print("NO_SYMBOLS")
    def getIdentify(self,text):
        self.Compiled = self.identifierSymbol.match(text)
        try:
            self.fileWrite.write("IDENTIFIER        "+self.Compiled.group(1)+"\n")
            self.getSymbols(text)
            for i in self.Compiled.group(3):
                if i ==" " :
                    continue
                if self.isNumber(i):
                    self.fileWrite.write("NUMBER        ")
                else:
                    self.fileWrite.write("WORD        ")
                self.fileWrite.write(self.Compiled.group(3)+"\n")
        except:
            print("NO_IDENTIFIRES")
    def getTokensSymbols(self,symbol):
        try: 
            return self.symbols[symbol]
        except:
            print("NOT_DEFINED_IN_SYMBOL_DICT")
            return "UNKNOWN"

    def isNumber(self,text):
         try:
             int(text)
             return True
         except:
             return False

if __name__ == "__main__":
    instance = OwnCompiler("input.txt")
    instance.compileOutput()
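
While debugging, I tried my patterns on single lines by themselves, and this reproduces both problems (I suspect the greedy .* at the start of each pattern, but I am not sure):

import re

identifierSymbol = re.compile(r".*(\w+)\s+(:=)\s+(.*)")
m = identifierSymbol.match("        fact  := 1;")
print(m.group(1))  # prints 't' - the greedy .* swallows 'fac' and leaves only one letter for (\w+)

symbolPattern = re.compile(r".*(\+|\*|-|/|=|<|>|\(|\)|;)")
m = symbolPattern.match("x;")
print(m.group())   # prints 'x;' - group() is the whole match, including the .* part
print(m.group(1))  # prints ';' - the symbol alone would be group(1)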

And here is my output

Type        Token
==================
COMMENT        { Sample program in TINY language – computes factorial }
COMMENT        {input an integer }
RESERVE_WORD        read
UNKNOWN        x;
COMMENT        {  don’t compute if x <= 0 }
RESERVE_WORD        if
UNKNOWN        0 < x   then     {  don’t compute if x <=
IDENTIFIER        t
UNKNOWN                fact  := 1;
RESERVE_WORD        repeat
IDENTIFIER        t
UNKNOWN                   fact  := fact *  x;
IDENTIFIER        x
UNKNOWN                    x  := x  -
RESERVE_WORD        until
UNKNOWN        x  =  0;
COMMENT        {  output  factorial of x }
RESERVE_WORD        write
RESERVE_WORD        end

Upvotes: 0

Views: 369

Answers (1)

Booboo

Reputation: 44223

If you are going to parse a language, you need a 'lexer' that returns individual tokens while ignoring whitespace and comments. Along these lines, just as an example:

import re, collections

class Lexer(object):

    WHITESPACE = r'(?P<WHITESPACE>\s+)'
    COMMENT = r'(?P<COMMENT>{[^}]*})'
    READ = r'(?P<READ>\bread\b)'
    WRITE = r'(?P<WRITE>\bwrite\b)'
    IF = r'(?P<IF>\bif\b)'
    THEN = r'(?P<THEN>\bthen\b)'
    ELSE = r'(?P<ELSE>\belse\b)'
    END = r'(?P<END>\bend\b)'
    REPEAT = r'(?P<REPEAT>\brepeat\b)'
    UNTIL = r'(?P<UNTIL>\buntil\b)'
    OPERATOR = r'(?P<OPERATOR>(?:[+*/=<>-]|:=))'
    LPAREN = r'(?P<LPAREN>\()'
    RPAREN = r'(?P<RPAREN>\))'
    IDENTIFIER = r'(?P<IDENTIFIER>[a-z]+)'
    INTEGER = r'(?P<INTEGER>\d+)'
    SEMICOLON = r'(?P<SEMICOLON>;)'

    regex = re.compile('|'.join([
        WHITESPACE,
        COMMENT,
        READ,
        WRITE,
        IF,
        THEN,
        ELSE,
        END,
        REPEAT,
        UNTIL,
        OPERATOR,
        LPAREN,
        RPAREN,
        IDENTIFIER,
        INTEGER,
        SEMICOLON
        ]))

    def __init__ (self, file):

        def generate_tokens(text):
            Token = collections.namedtuple('Token', ['type','value'])
            scanner = Lexer.regex.finditer(text)
            last_end = 0
            for m in scanner:
                start = m.start()
                end = m.end()
                if start != last_end:
                    # a gap between matches means the skipped-over text was
                    # unrecognizable, so emit an "error token" for it
                    error_text = text[last_end:start]
                    yield Token('ERROR', error_text)
                last_end = end
                token = Token(m.lastgroup, m.group())
                if token.type != 'WHITESPACE' and token.type != 'COMMENT':
                    yield token
            yield Token('EOF', '<end-of-file>')


        with open(file, "r") as f:
            text = f.read()
            self._token_generator = generate_tokens(text)

    def next_token(self):
        # if you call this past the "EOF" token you will get a StopIteration exception
        return next(self._token_generator)


lexer = Lexer('input.txt')
while True:
    token = lexer.next_token()
    print(token)
    if token.type == 'EOF':
        break

Prints:

Token(type='READ', value='read')
Token(type='IDENTIFIER', value='x')
Token(type='SEMICOLON', value=';')
Token(type='IF', value='if')
Token(type='INTEGER', value='0')
Token(type='OPERATOR', value='<')
Token(type='IDENTIFIER', value='x')
Token(type='THEN', value='then')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value=':=')
Token(type='INTEGER', value='1')
Token(type='SEMICOLON', value=';')
Token(type='REPEAT', value='repeat')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value=':=')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value='*')
Token(type='IDENTIFIER', value='x')
Token(type='SEMICOLON', value=';')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value=':=')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value='-')
Token(type='INTEGER', value='1')
Token(type='UNTIL', value='until')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value='=')
Token(type='INTEGER', value='0')
Token(type='SEMICOLON', value=';')
Token(type='WRITE', value='write')
Token(type='IDENTIFIER', value='fact')
Token(type='END', value='end')
Token(type='EOF', value='<end-of-file>')
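
From here a parser can pull tokens on demand with next_token(). As a rough sketch of how that might look (the expect helper below is hypothetical, not part of the lexer above):

def expect(lexer, expected_type):
    # consume one token and fail loudly if it is not of the expected type
    token = lexer.next_token()
    if token.type != expected_type:
        raise SyntaxError('expected {}, got {}'.format(expected_type, token))
    return token

lexer = Lexer('input.txt')
expect(lexer, 'READ')        # Token(type='READ', value='read')
expect(lexer, 'IDENTIFIER')  # Token(type='IDENTIFIER', value='x')
expect(lexer, 'SEMICOLON')   # Token(type='SEMICOLON', value=';')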

Upvotes: 1
