Add query-language AST, parser, tokenizer, and parsing-error modules

This commit is contained in:
Jann Stute
2024-11-27 20:42:57 +01:00
parent 20f93719d7
commit 7981d13274
4 changed files with 330 additions and 0 deletions

View File

@@ -0,0 +1,58 @@
from enum import Enum
from typing import Union
class ConstraintType(Enum):
    """Kind of search constraint a query prefix introduces (e.g. "tag:")."""

    Tag = 0
    MediaType = 1

    @staticmethod
    def from_string(text: str) -> Union["ConstraintType", None]:
        """Map a case-insensitive prefix like "tag" to its enum member.

        Returns None for unrecognized text so the caller (the tokenizer)
        can report the error with position information.

        BUG FIX: the return annotation previously claimed a ConstraintType
        is always returned, but unknown text yields None.
        """
        return {
            "tag": ConstraintType.Tag,
            "mediatype": ConstraintType.MediaType
        }.get(text.lower(), None)
class AST:
    """Base class for all query-AST nodes.

    Provides a generic debug representation that renders every instance
    attribute, so subclasses get readable output for free.
    """

    def __str__(self) -> str:
        # Render all instance attributes in insertion order.
        attrs = ", ".join(f"{name}={val}" for name, val in vars(self).items())
        return f"{type(self).__name__}({attrs})"

    def __repr__(self) -> str:
        return str(self)
class ANDList(AST):
    """Conjunction node: every element must match."""

    # The OR-groups joined by (implicit or explicit) AND.
    elements: list["ORList"]

    def __init__(self, elements: list["ORList"]) -> None:
        super().__init__()
        self.elements = elements
class ORList(AST):
    """Disjunction node: at least one term must match."""

    # The alternatives joined by "OR".
    terms: list[Union[ANDList, "Constraint"]]

    def __init__(self, terms: list[Union[ANDList, "Constraint"]]) -> None:
        super().__init__()
        self.terms = terms
class Constraint(AST):
    """Leaf node: a single tag/mediatype test, optionally refined by
    key=value properties (e.g. tag:test[parent=Color])."""

    type: ConstraintType
    value: str
    properties: list["Property"]

    # NOTE: assignment order is kept (type, value, properties) because the
    # inherited __str__ renders attributes in insertion order.
    def __init__(self, type: ConstraintType, value: str, properties: list["Property"]) -> None:
        super().__init__()
        self.type = type
        self.value = value
        self.properties = properties
class Property(AST):
    """A key=value refinement attached to a Constraint."""

    key: str
    value: str

    def __init__(self, key: str, value: str) -> None:
        super().__init__()
        self.key = key
        self.value = value

View File

@@ -0,0 +1,104 @@
from src.core.query_lang.ast import AST, ANDList, Constraint, ORList, Property
from src.core.query_lang.tokenizer import ConstraintType, Token, Tokenizer, TokenType
from src.core.query_lang.util import ParsingError
class Parser:
    """Recursive-descent parser for the query language.

    Grammar (EBNF-ish):
        or_list    : and_list ( "OR" and_list )*
        and_list   : term ( [ "AND" ] term )*
        term       : "(" or_list ")" | constraint
        constraint : [ CONSTRAINTTYPE ] literal [ "[" property ( "," property )* "]" ]
        property   : ULITERAL "=" literal
        literal    : QLITERAL | ULITERAL
    """

    text: str
    tokenizer: Tokenizer
    # Single-token lookahead.
    next_token: Token
    # Constraints without an explicit "tag:"/"mediatype:" prefix reuse the
    # most recently parsed constraint type; the initial default is Tag.
    last_constraint_type: ConstraintType = ConstraintType.Tag

    def __init__(self, text: str) -> None:
        self.text = text
        self.tokenizer = Tokenizer(self.text)
        self.next_token = self.tokenizer.get_next_token()

    def parse(self) -> AST:
        """Parse the entire input and return the AST root.

        Raises ParsingError if tokens remain after a complete expression.
        """
        out = self.__or_list()
        if self.next_token.type != TokenType.EOF:
            raise ParsingError(self.next_token.start, self.next_token.end, "Syntax Error")
        return out

    def __and_list(self) -> ANDList:
        """and_list : term ( [ "AND" ] term )* — the AND keyword is optional."""
        elements = [self.__term()]
        while self.next_token.type != TokenType.EOF and not self.__is_next_or():
            self.__skip_and()
            elements.append(self.__term())
        return ANDList(elements)

    def __skip_and(self) -> None:
        """Consume at most one optional "AND"; two in a row is an error."""
        if self.__is_next_and():
            self.__eat(TokenType.ULITERAL)
            if self.__is_next_and():
                # __syntax_error raises itself; the original's extra
                # "raise" before this call was dead code.
                self.__syntax_error("Unexpected AND")

    def __is_next_and(self) -> bool:
        return self.next_token.type == TokenType.ULITERAL and self.next_token.value.upper() == "AND"

    def __or_list(self) -> ORList:
        """or_list : and_list ( "OR" and_list )*"""
        terms = [self.__and_list()]
        while self.__is_next_or():
            self.__eat(TokenType.ULITERAL)  # consume the "OR" keyword
            terms.append(self.__and_list())
        return ORList(terms)

    def __is_next_or(self) -> bool:
        return self.next_token.type == TokenType.ULITERAL and self.next_token.value.upper() == "OR"

    def __term(self) -> AST:
        """term : "(" or_list ")" | constraint"""
        if self.next_token.type == TokenType.RBRACKETO:
            self.__eat(TokenType.RBRACKETO)
            # BUG FIX: this called __and_list(), which made "(a OR b)" a
            # syntax error; a parenthesized group must allow OR expressions.
            out = self.__or_list()
            self.__eat(TokenType.RBRACKETC)
            return out
        else:
            return self.__constraint()

    def __constraint(self) -> Constraint:
        """constraint : [CONSTRAINTTYPE] literal ["[" property ("," property)* "]"]"""
        if self.next_token.type == TokenType.CONSTRAINTTYPE:
            self.last_constraint_type = self.__eat(TokenType.CONSTRAINTTYPE).value
        value = self.__literal()
        properties = []
        if self.next_token.type == TokenType.SBRACKETO:
            self.__eat(TokenType.SBRACKETO)
            properties.append(self.__property())
            while self.next_token.type == TokenType.COMMA:
                self.__eat(TokenType.COMMA)
                properties.append(self.__property())
            self.__eat(TokenType.SBRACKETC)
        return Constraint(self.last_constraint_type, value, properties)

    def __property(self) -> Property:
        """property : ULITERAL "=" literal"""
        key = self.__eat(TokenType.ULITERAL).value
        self.__eat(TokenType.EQUALS)
        value = self.__literal()
        return Property(key, value)

    def __literal(self) -> str:
        """literal : QLITERAL | ULITERAL"""
        if self.next_token.type in (TokenType.QLITERAL, TokenType.ULITERAL):
            return self.__eat(self.next_token.type).value
        # BUG FIX: previously fell through and silently returned None for
        # any other token type; now reports a syntax error.
        self.__syntax_error(f"expected literal found {self.next_token.type}")

    def __eat(self, type: TokenType) -> Token:
        """Consume and return the lookahead token, which must be of *type*."""
        if self.next_token.type != type:
            self.__syntax_error(f"expected {type} found {self.next_token.type}")
        out = self.next_token
        self.next_token = self.tokenizer.get_next_token()
        return out

    def __syntax_error(self, msg: str = "Syntax Error") -> None:
        """Raise a ParsingError located at the current lookahead token."""
        raise ParsingError(self.next_token.start, self.next_token.end, msg)
# Ad-hoc manual smoke test of the parser on a representative query.
if __name__ == "__main__": # TODO remove
    print("") # noqa: T201
    p = Parser("Mario AND Luigi tag:test[parent=Color,color=red] OR mediatype:test")
    print(p.parse()) # noqa: T201

View File

@@ -0,0 +1,153 @@
from enum import Enum
from src.core.query_lang.ast import ConstraintType
from src.core.query_lang.util import ParsingError
class TokenType(Enum):
    """Categories of lexemes produced by the query-language Tokenizer."""

    EOF = -1
    QLITERAL = 0 # Quoted Literal
    ULITERAL = 1 # Unquoted Literal (does not contain ":", " ", "[", "]", "(", ")", "=", ",")
    RBRACKETO = 2 # Round Bracket Open
    RBRACKETC = 3 # Round Bracket Close
    SBRACKETO = 4 # Square Bracket Open
    SBRACKETC = 5 # Square Bracket Close
    CONSTRAINTTYPE = 6 # e.g. "tag:" / "mediatype:"; token value is a ConstraintType
    COLON = 10
    COMMA = 11
    EQUALS = 12
class Token:
    """A single lexeme plus its (inclusive) character span in the input.

    start/end are None for synthetic tokens (e.g. EOF) that have no
    source position.
    """

    type: TokenType
    # Payload depends on the token type: str for literals, ConstraintType
    # for CONSTRAINTTYPE tokens, None for pure punctuation.
    # BUG FIX: was annotated "any" (the builtin function), not a type.
    value: object
    start: "int | None"
    end: "int | None"

    def __init__(self, type: TokenType, value: object, start: "int | None" = None, end: "int | None" = None) -> None:
        self.type = type
        self.value = value
        self.start = start
        self.end = end

    @staticmethod
    def from_type(type: TokenType, pos: "int | None" = None) -> "Token":
        """Build a value-less, single-position token (brackets, commas, ...).

        BUG FIX: the return annotation previously claimed TokenType, but a
        Token is returned.
        """
        return Token(type, None, pos, pos)

    @staticmethod
    def EOF() -> "Token": # noqa: N802
        """Sentinel token marking the end of input."""
        return Token.from_type(TokenType.EOF)

    def __str__(self) -> str:
        return f"Token({self.type}, {self.value}, {self.start}, {self.end})"

    def __repr__(self) -> str:
        return self.__str__()
class Tokenizer:
    """Streams Tokens from a query string, one per get_next_token() call."""

    text: str
    pos: int
    # Character at self.pos, or None once the input is exhausted.
    current_char: str
    # Characters that may follow a backslash inside a quoted literal.
    # BUG FIX: the list contained '"' twice; "'" (the other accepted quote
    # character) was missing.
    ESCAPABLE_CHARS = ["\\", '"', "'"]
    # Characters that terminate an unquoted literal.
    NOT_IN_ULITERAL = [":", " ", "[", "]", "(", ")", "=", ","]

    def __init__(self, text: str) -> None:
        self.text = text
        self.pos = 0
        # BUG FIX: empty input used to raise IndexError here.
        self.current_char = self.text[self.pos] if self.text else None

    def get_next_token(self) -> Token:
        """Return the next token, or an EOF token once input is exhausted."""
        self.__skip_whitespace()
        if self.current_char is None:
            return Token.EOF()
        if self.current_char in ("'", '"'):
            return self.__quoted_string()
        elif self.current_char == "(":
            self.__advance()
            return Token.from_type(TokenType.RBRACKETO, self.pos - 1)
        elif self.current_char == ")":
            self.__advance()
            return Token.from_type(TokenType.RBRACKETC, self.pos - 1)
        elif self.current_char == "[":
            self.__advance()
            return Token.from_type(TokenType.SBRACKETO, self.pos - 1)
        elif self.current_char == "]":
            self.__advance()
            return Token.from_type(TokenType.SBRACKETC, self.pos - 1)
        elif self.current_char == ",":
            self.__advance()
            return Token.from_type(TokenType.COMMA, self.pos - 1)
        elif self.current_char == "=":
            self.__advance()
            return Token.from_type(TokenType.EQUALS, self.pos - 1)
        else:
            return self.__unquoted_string_or_constraint_type()

    def __unquoted_string_or_constraint_type(self) -> Token:
        """Lex an ULITERAL, or a CONSTRAINTTYPE when a ":" follows it."""
        out = ""
        start = self.pos
        while self.current_char not in self.NOT_IN_ULITERAL and self.current_char is not None:
            out += self.current_char
            self.__advance()
        end = self.pos - 1
        if self.current_char == ":":
            if len(out) == 0:
                raise ParsingError(self.pos, self.pos)
            self.__advance()
            constraint_type = ConstraintType.from_string(out)
            if constraint_type is None:
                # BUG FIX: message previously read "ContraintType".
                raise ParsingError(start, end, f"Invalid ConstraintType \"{out}\"")
            return Token(TokenType.CONSTRAINTTYPE, constraint_type, start, end)
        else:
            return Token(TokenType.ULITERAL, out, start, end)

    def __quoted_string(self) -> Token:
        """Lex a QLITERAL delimited by the quote char at the current position.

        Backslash escapes the characters in ESCAPABLE_CHARS; any other
        backslash sequence is kept literally.
        """
        start = self.pos
        quote = self.current_char
        self.__advance()
        escape = False
        out = ""
        while escape or self.current_char != quote:
            # BUG FIX: an unterminated literal used to spin until a
            # TypeError (str + None); now it is a proper ParsingError.
            if self.current_char is None:
                raise ParsingError(start, self.pos, "Unterminated quoted literal")
            if escape:
                escape = False
                if self.current_char not in Tokenizer.ESCAPABLE_CHARS:
                    # BUG FIX: unrecognized escapes kept the backslash but
                    # dropped the following character; keep both.
                    out += "\\"
                out += self.current_char
                self.__advance()
                continue
            if self.current_char == "\\":
                escape = True
            else:
                out += self.current_char
            self.__advance()
        end = self.pos
        self.__advance()  # consume the closing quote
        return Token(TokenType.QLITERAL, out, start, end)

    def __advance(self) -> None:
        """Move to the next character; current_char becomes None at end of input."""
        if self.pos < len(self.text) - 1:
            self.pos += 1
            self.current_char = self.text[self.pos]
        else:
            self.current_char = None

    def __skip_whitespace(self) -> None:
        # BUG FIX: trailing whitespace used to crash with AttributeError
        # ("None.isspace()") once the input was exhausted mid-skip.
        while self.current_char is not None and self.current_char.isspace():
            self.__advance()
# Ad-hoc manual smoke test: print every token of a representative query.
if __name__ == "__main__": # TODO remove
    t = Tokenizer("Mario AND Luigi tag:test[parent=Color,color=red]")
    last = Token(None, None)
    while last.type != TokenType.EOF:
        last = t.get_next_token()
        print(last) # noqa: T201

View File

@@ -0,0 +1,15 @@
class ParsingError(Exception):
    """Raised when the query tokenizer/parser rejects its input.

    Carries the (inclusive) character span of the offending input so
    callers can point at the problem.

    BUG FIX: previously derived from BaseException, which escapes generic
    "except Exception" boundaries; a recoverable parse failure is an
    ordinary error and belongs under Exception.
    """

    start: int
    end: int
    msg: str

    def __init__(self, start: int, end: int, msg: str = "Syntax Error") -> None:
        super().__init__(msg)  # cooperate with the Exception machinery
        self.start = start
        self.end = end
        self.msg = msg

    def __str__(self) -> str:
        return f"Syntax Error {self.start}->{self.end}: {self.msg}"

    def __repr__(self) -> str:
        return self.__str__()