# === new file: tagstudio/src/core/query_lang/ast.py ===
from enum import Enum
from typing import Optional, Union


class ConstraintType(Enum):
    """The kind of thing a query constraint matches against."""

    Tag = 0
    MediaType = 1

    @staticmethod
    def from_string(text: str) -> Optional["ConstraintType"]:
        """Map a case-insensitive keyword ("tag", "mediatype") to a
        ConstraintType, or return None for an unknown keyword.

        BUG FIX: the original annotated the return type as plain
        "ConstraintType" even though ``.get(..., None)`` can return None;
        the tokenizer relies on that None sentinel to reject invalid
        constraint types.
        """
        return {
            "tag": ConstraintType.Tag,
            "mediatype": ConstraintType.MediaType,
        }.get(text.lower())


class AST:
    """Base class of all query-language AST nodes.

    Provides a generic string representation built from the node's
    instance attributes, so subclasses need no custom __repr__.
    """

    def __str__(self) -> str:
        class_name = self.__class__.__name__
        fields = vars(self)  # all instance variables as a dictionary
        field_str = ", ".join(f"{key}={value}" for key, value in fields.items())
        return f"{class_name}({field_str})"

    def __repr__(self) -> str:
        return self.__str__()


class ANDList(AST):
    """Conjunction node: every element must match."""

    elements: list["ORList"]

    def __init__(self, elements: list["ORList"]) -> None:
        super().__init__()
        self.elements = elements


class ORList(AST):
    """Disjunction node: at least one term must match."""

    terms: list[Union[ANDList, "Constraint"]]

    def __init__(self, terms: list[Union[ANDList, "Constraint"]]) -> None:
        super().__init__()
        self.terms = terms


class Constraint(AST):
    """A single search constraint, e.g. ``tag:name[key=value]``."""

    type: ConstraintType
    value: str
    properties: list["Property"]

    def __init__(self, type: ConstraintType, value: str, properties: list["Property"]) -> None:
        super().__init__()
        self.type = type
        self.value = value
        self.properties = properties


class Property(AST):
    """A ``key=value`` refinement attached to a Constraint."""

    key: str
    value: str

    def __init__(self, key: str, value: str) -> None:
        super().__init__()
        self.key = key
        self.value = value


# === new file: tagstudio/src/core/query_lang/parser.py ===
from src.core.query_lang.ast import AST, ANDList, Constraint, ORList, Property
from src.core.query_lang.tokenizer import ConstraintType, Token, Tokenizer, TokenType
from src.core.query_lang.util import ParsingError


class Parser:
    """Recursive-descent parser for the query language.

    Informal grammar (as implemented below):
        or_list    : and_list ("OR" and_list)*
        and_list   : term (["AND"] term)*
        term       : "(" or_list ")" | constraint
        constraint : [CONSTRAINTTYPE] literal ["[" property ("," property)* "]"]
        property   : ULITERAL "=" literal
    """

    text: str
    tokenizer: Tokenizer
    next_token: Token

    # The constraint type is "sticky": in ``tag:a b`` both a and b are
    # parsed as Tag constraints. Defaults to Tag.
    last_constraint_type: ConstraintType = ConstraintType.Tag

    def __init__(self, text: str) -> None:
        self.text = text
        self.tokenizer = Tokenizer(self.text)
        self.next_token = self.tokenizer.get_next_token()

    def parse(self) -> AST:
        """Parse the whole input and return the AST root.

        Raises ParsingError if any input remains after the expression.
        """
        out = self.__or_list()
        if self.next_token.type != TokenType.EOF:
            raise ParsingError(self.next_token.start, self.next_token.end, "Syntax Error")
        return out

    def __and_list(self) -> ANDList:
        """Parse one or more terms joined by an (optional) AND keyword."""
        elements = [self.__term()]
        while self.next_token.type != TokenType.EOF and not self.__is_next_or():
            self.__skip_and()
            elements.append(self.__term())
        return ANDList(elements)

    def __skip_and(self) -> None:
        """Consume at most one explicit AND; two in a row is an error."""
        if self.__is_next_and():
            self.__eat(TokenType.ULITERAL)

        if self.__is_next_and():
            # __syntax_error raises; no need to re-raise its return value.
            self.__syntax_error("Unexpected AND")

    def __is_next_and(self) -> bool:
        return self.next_token.type == TokenType.ULITERAL and self.next_token.value.upper() == "AND"

    def __or_list(self) -> ORList:
        terms = [self.__and_list()]

        while self.__is_next_or():
            self.__eat(TokenType.ULITERAL)
            terms.append(self.__and_list())

        return ORList(terms)

    def __is_next_or(self) -> bool:
        return self.next_token.type == TokenType.ULITERAL and self.next_token.value.upper() == "OR"

    def __term(self) -> AST:
        """Parse a parenthesized group or a single constraint."""
        if self.next_token.type == TokenType.RBRACKETO:
            self.__eat(TokenType.RBRACKETO)
            # BUG FIX: a parenthesized group must accept the full expression
            # grammar; the original parsed only an and_list here, so
            # "(a OR b)" raised a spurious syntax error on the OR.
            out = self.__or_list()
            self.__eat(TokenType.RBRACKETC)
            return out
        else:
            return self.__constraint()

    def __constraint(self) -> Constraint:
        """Parse ``[type:]value[[key=val,...]]``; type is sticky."""
        if self.next_token.type == TokenType.CONSTRAINTTYPE:
            self.last_constraint_type = self.__eat(TokenType.CONSTRAINTTYPE).value

        value = self.__literal()

        properties = []
        if self.next_token.type == TokenType.SBRACKETO:
            self.__eat(TokenType.SBRACKETO)
            properties.append(self.__property())

            while self.next_token.type == TokenType.COMMA:
                self.__eat(TokenType.COMMA)
                properties.append(self.__property())

            self.__eat(TokenType.SBRACKETC)

        return Constraint(self.last_constraint_type, value, properties)

    def __property(self) -> Property:
        key = self.__eat(TokenType.ULITERAL).value
        self.__eat(TokenType.EQUALS)
        value = self.__literal()
        return Property(key, value)

    def __literal(self) -> str:
        """Consume a quoted or unquoted literal and return its value."""
        if self.next_token.type in [TokenType.QLITERAL, TokenType.ULITERAL]:
            return self.__eat(self.next_token.type).value
        # BUG FIX: the original fell through and implicitly returned None,
        # silently producing constraints/properties with value None.
        self.__syntax_error(f"expected literal found {self.next_token.type}")

    def __eat(self, type: TokenType) -> Token:
        """Consume the next token, which must be of the given type."""
        if self.next_token.type != type:
            self.__syntax_error(f"expected {type} found {self.next_token.type}")
        out = self.next_token
        self.next_token = self.tokenizer.get_next_token()
        return out

    def __syntax_error(self, msg: str = "Syntax Error") -> None:
        # Always raises; annotated None for compatibility with callers.
        raise ParsingError(self.next_token.start, self.next_token.end, msg)


if __name__ == "__main__":  # TODO remove
    print("")  # noqa: T201
    p = Parser("Mario AND Luigi tag:test[parent=Color,color=red] OR mediatype:test")
    print(p.parse())  # noqa: T201


# === new file: tagstudio/src/core/query_lang/tokenizer.py ===
from enum import Enum
from typing import Any, Optional

from src.core.query_lang.ast import ConstraintType
from src.core.query_lang.util import ParsingError


class TokenType(Enum):
    """Lexical token categories produced by the Tokenizer."""

    EOF = -1
    QLITERAL = 0  # Quoted Literal
    ULITERAL = 1  # Unquoted Literal (does not contain ":", " ", "[", "]", "(", ")", "=", ",")
    RBRACKETO = 2  # Round Bracket Open
    RBRACKETC = 3  # Round Bracket Close
    SBRACKETO = 4  # Square Bracket Open
    SBRACKETC = 5  # Square Bracket Close
    CONSTRAINTTYPE = 6
    COLON = 10
    COMMA = 11
    EQUALS = 12


class Token:
    """A token plus the [start, end] character span it covers in the input."""

    type: TokenType
    # BUG FIX: the original annotated value/start/end with the builtin
    # ``any`` function and plain ``int``; use typing.Any / Optional[int]
    # since start/end default to None for synthetic tokens (e.g. EOF).
    value: Any
    start: Optional[int]
    end: Optional[int]

    def __init__(
        self, type: TokenType, value: Any, start: Optional[int] = None, end: Optional[int] = None
    ) -> None:
        self.type = type
        self.value = value
self.start = start + self.end = end + + @staticmethod + def from_type(type: TokenType, pos: int = None) -> TokenType: + return Token(type, None, pos, pos) + + @staticmethod + def EOF() -> "Token": # noqa: N802 + return Token.from_type(TokenType.EOF) + + def __str__(self) -> str: + return f"Token({self.type}, {self.value}, {self.start}, {self.end})" + + def __repr__(self) -> str: + return self.__str__() + +class Tokenizer: + text: str + pos: int + current_char: str + + ESCAPABLE_CHARS = ["\\", '"', '"'] + NOT_IN_ULITERAL = [":", " ", "[", "]", "(", ")", "=", ","] + + def __init__(self, text: str) -> None: + self.text = text + self.pos = 0 + self.current_char = self.text[self.pos] + + def get_next_token(self) -> Token: + self.__skip_whitespace() + if self.current_char is None: + return Token.EOF() + + if self.current_char in ("'", '"'): + return self.__quoted_string() + elif self.current_char == "(": + self.__advance() + return Token.from_type(TokenType.RBRACKETO, self.pos - 1) + elif self.current_char == ")": + self.__advance() + return Token.from_type(TokenType.RBRACKETC, self.pos - 1) + elif self.current_char == "[": + self.__advance() + return Token.from_type(TokenType.SBRACKETO, self.pos - 1) + elif self.current_char == "]": + self.__advance() + return Token.from_type(TokenType.SBRACKETC, self.pos - 1) + elif self.current_char == ",": + self.__advance() + return Token.from_type(TokenType.COMMA, self.pos - 1) + elif self.current_char == "=": + self.__advance() + return Token.from_type(TokenType.EQUALS, self.pos - 1) + else: + return self.__unquoted_string_or_constraint_type() + + def __unquoted_string_or_constraint_type(self) -> Token: + out = "" + + start = self.pos + + while self.current_char not in self.NOT_IN_ULITERAL and self.current_char is not None: + out += self.current_char + self.__advance() + + end = self.pos - 1 + + if self.current_char == ":": + if len(out) == 0: + raise ParsingError(self.pos, self.pos) + self.__advance() + constraint_type = 
ConstraintType.from_string(out) + if constraint_type is None: + raise ParsingError(start, end, f"Invalid ContraintType \"{out}\"") + return Token(TokenType.CONSTRAINTTYPE, constraint_type, start, end) + else: + return Token(TokenType.ULITERAL, out, start, end) + + def __quoted_string(self) -> Token: + start = self.pos + quote = self.current_char + self.__advance() + escape = False + out = "" + + while escape or self.current_char != quote: + if escape: + escape = False + if self.current_char not in Tokenizer.ESCAPABLE_CHARS: + out += "\\" + else: + out += self.current_char + self.__advance() + continue + if self.current_char == "\\": + escape = True + else: + out += self.current_char + self.__advance() + end = self.pos + self.__advance() + return Token(TokenType.QLITERAL, out, start, end) + + def __advance(self) -> None: + if self.pos < len(self.text) - 1: + self.pos += 1 + self.current_char = self.text[self.pos] + else: + self.current_char = None + + def __skip_whitespace(self) -> None: + if self.current_char is None: + return + while self.current_char.isspace(): + self.__advance() + +if __name__ == "__main__": #TODO remove + t = Tokenizer("Mario AND Luigi tag:test[parent=Color,color=red]") + last = Token(None, None) + while last.type != TokenType.EOF: + last = t.get_next_token() + print(last) # noqa: T201 diff --git a/tagstudio/src/core/query_lang/util.py b/tagstudio/src/core/query_lang/util.py new file mode 100644 index 00000000..93885e89 --- /dev/null +++ b/tagstudio/src/core/query_lang/util.py @@ -0,0 +1,15 @@ +class ParsingError(BaseException): + start: int + end: int + msg: str + + def __init__(self, start: int, end: int, msg: str = "Syntax Error") -> None: + self.start = start + self.end = end + self.msg = msg + + def __str__(self) -> str: + return f"Syntax Error {self.start}->{self.end}: {self.msg}" + + def __repr__(self) -> str: + return self.__str__() \ No newline at end of file