authentik.sources.scim.patch.lexer

  1from dataclasses import dataclass
  2from enum import Enum
  3
  4from authentik.sources.scim.constants import (
  5    SCIM_URN_GROUP,
  6    SCIM_URN_SCHEMA,
  7    SCIM_URN_USER,
  8    SCIM_URN_USER_ENTERPRISE,
  9)
 10
 11
 12# Token types for SCIM path parsing
 13class TokenType(Enum):
 14    ATTRIBUTE = "ATTRIBUTE"
 15    DOT = "DOT"
 16    LBRACKET = "LBRACKET"
 17    RBRACKET = "RBRACKET"
 18    LPAREN = "LPAREN"
 19    RPAREN = "RPAREN"
 20    STRING = "STRING"
 21    NUMBER = "NUMBER"
 22    BOOLEAN = "BOOLEAN"
 23    NULL = "NULL"
 24    OPERATOR = "OPERATOR"
 25    AND = "AND"
 26    OR = "OR"
 27    NOT = "NOT"
 28    EOF = "EOF"
 29
 30
 31@dataclass
 32class Token:
 33    type: TokenType
 34    value: str
 35    position: int = 0
 36
 37
 38class SCIMPathLexer:
 39    """Lexer for SCIM paths and filter expressions"""
 40
 41    OPERATORS = ["eq", "ne", "co", "sw", "ew", "gt", "lt", "ge", "le", "pr"]
 42
 43    def __init__(self, text: str):
 44        self.schema_urns = [
 45            SCIM_URN_SCHEMA,
 46            SCIM_URN_GROUP,
 47            SCIM_URN_USER,
 48            SCIM_URN_USER_ENTERPRISE,
 49        ]
 50        self.text = text
 51        self.pos = 0
 52        self.current_char = self.text[self.pos] if self.pos < len(self.text) else None
 53
 54    def advance(self):
 55        """Move to next character"""
 56        self.pos += 1
 57        self.current_char = self.text[self.pos] if self.pos < len(self.text) else None
 58
 59    def skip_whitespace(self):
 60        """Skip whitespace characters"""
 61        while self.current_char and self.current_char.isspace():
 62            self.advance()
 63
 64    def read_string(self, quote_char):
 65        """Read a quoted string"""
 66        value = ""
 67        self.advance()  # Skip opening quote
 68
 69        while self.current_char and self.current_char != quote_char:
 70            if self.current_char == "\\":
 71                self.advance()
 72                if self.current_char:
 73                    value += self.current_char
 74                    self.advance()
 75            else:
 76                value += self.current_char
 77                self.advance()
 78
 79        if self.current_char == quote_char:
 80            self.advance()  # Skip closing quote
 81
 82        return value
 83
 84    def read_number(self):
 85        """Read a number (integer or float)"""
 86        value = ""
 87        while self.current_char and (self.current_char.isdigit() or self.current_char == "."):
 88            value += self.current_char
 89            self.advance()
 90        return value
 91
 92    def read_identifier(self):
 93        """Read an identifier (attribute name or operator) - supports URN format"""
 94        value = ""
 95        while self.current_char and (self.current_char.isalnum() or self.current_char in "_-:"):
 96            value += self.current_char
 97            self.advance()
 98            # If the identifier value so far is a schema URN, take that as the identifier and
 99            # treat the next part as a sub_attribute
100            if value in self.schema_urns:
101                self.current_char = "."
102                return value
103
104            # Handle dots within URN identifiers (like "2.0")
105            # A dot is part of the identifier if it's followed by a digit
106            if (
107                self.current_char == "."
108                and self.pos + 1 < len(self.text)
109                and self.text[self.pos + 1].isdigit()
110            ):
111                value += self.current_char
112                self.advance()
113                # Continue reading digits after the dot
114                while self.current_char and self.current_char.isdigit():
115                    value += self.current_char
116                    self.advance()
117
118        return value
119
120    def get_next_token(self) -> Token:  # noqa PLR0911
121        """Get the next token from the input"""
122        while self.current_char:
123            if self.current_char.isspace():
124                self.skip_whitespace()
125                continue
126
127            if self.current_char == ".":
128                self.advance()
129                return Token(TokenType.DOT, ".")
130
131            if self.current_char == "[":
132                self.advance()
133                return Token(TokenType.LBRACKET, "[")
134
135            if self.current_char == "]":
136                self.advance()
137                return Token(TokenType.RBRACKET, "]")
138
139            if self.current_char == "(":
140                self.advance()
141                return Token(TokenType.LPAREN, "(")
142
143            if self.current_char == ")":
144                self.advance()
145                return Token(TokenType.RPAREN, ")")
146
147            if self.current_char in "\"'":
148                quote_char = self.current_char
149                value = self.read_string(quote_char)
150                return Token(TokenType.STRING, value)
151
152            if self.current_char.isdigit():
153                value = self.read_number()
154                return Token(TokenType.NUMBER, value)
155
156            if self.current_char.isalpha() or self.current_char == "_":
157                value = self.read_identifier()
158
159                # Check for special keywords
160                if value.lower() == "true":
161                    return Token(TokenType.BOOLEAN, True)
162                elif value.lower() == "false":
163                    return Token(TokenType.BOOLEAN, False)
164                elif value.lower() == "null":
165                    return Token(TokenType.NULL, None)
166                elif value.lower() == "and":
167                    return Token(TokenType.AND, "and")
168                elif value.lower() == "or":
169                    return Token(TokenType.OR, "or")
170                elif value.lower() == "not":
171                    return Token(TokenType.NOT, "not")
172                elif value.lower() in self.OPERATORS:
173                    return Token(TokenType.OPERATOR, value.lower())
174                else:
175                    return Token(TokenType.ATTRIBUTE, value)
176
177            # Skip unknown characters
178            self.advance()
179
180        return Token(TokenType.EOF, "")
class TokenType(enum.Enum):
class TokenType(Enum):
    """Kinds of token produced when lexing SCIM paths and filter expressions

    Each member's value is the member's own name as a string.
    """

    # Attribute names and the "." that separates them
    ATTRIBUTE = "ATTRIBUTE"
    DOT = "DOT"

    # Grouping: "[" "]" and "(" ")"
    LBRACKET = "LBRACKET"
    RBRACKET = "RBRACKET"
    LPAREN = "LPAREN"
    RPAREN = "RPAREN"

    # Literal values
    STRING = "STRING"
    NUMBER = "NUMBER"
    BOOLEAN = "BOOLEAN"
    NULL = "NULL"

    # Comparison operators and logical keywords
    OPERATOR = "OPERATOR"
    AND = "AND"
    OR = "OR"
    NOT = "NOT"

    # End of input
    EOF = "EOF"
ATTRIBUTE = <TokenType.ATTRIBUTE: 'ATTRIBUTE'>
DOT = <TokenType.DOT: 'DOT'>
LBRACKET = <TokenType.LBRACKET: 'LBRACKET'>
RBRACKET = <TokenType.RBRACKET: 'RBRACKET'>
LPAREN = <TokenType.LPAREN: 'LPAREN'>
RPAREN = <TokenType.RPAREN: 'RPAREN'>
STRING = <TokenType.STRING: 'STRING'>
NUMBER = <TokenType.NUMBER: 'NUMBER'>
BOOLEAN = <TokenType.BOOLEAN: 'BOOLEAN'>
NULL = <TokenType.NULL: 'NULL'>
OPERATOR = <TokenType.OPERATOR: 'OPERATOR'>
AND = <TokenType.AND: 'AND'>
OR = <TokenType.OR: 'OR'>
NOT = <TokenType.NOT: 'NOT'>
EOF = <TokenType.EOF: 'EOF'>
@dataclass
class Token:
@dataclass
class Token:
    """A single token produced by the lexer"""

    # Kind of token
    type: TokenType
    # Payload: text for most tokens, but BOOLEAN tokens carry a real bool and
    # NULL tokens carry None, so the annotation must allow all three.
    value: str | bool | None
    # Defaults to 0; the lexer in this module builds tokens without passing it
    position: int = 0
Token(type: TokenType, value: str, position: int = 0)
type: TokenType
value: str
position: int = 0
class SCIMPathLexer:
 39class SCIMPathLexer:
 40    """Lexer for SCIM paths and filter expressions"""
 41
 42    OPERATORS = ["eq", "ne", "co", "sw", "ew", "gt", "lt", "ge", "le", "pr"]
 43
 44    def __init__(self, text: str):
 45        self.schema_urns = [
 46            SCIM_URN_SCHEMA,
 47            SCIM_URN_GROUP,
 48            SCIM_URN_USER,
 49            SCIM_URN_USER_ENTERPRISE,
 50        ]
 51        self.text = text
 52        self.pos = 0
 53        self.current_char = self.text[self.pos] if self.pos < len(self.text) else None
 54
 55    def advance(self):
 56        """Move to next character"""
 57        self.pos += 1
 58        self.current_char = self.text[self.pos] if self.pos < len(self.text) else None
 59
 60    def skip_whitespace(self):
 61        """Skip whitespace characters"""
 62        while self.current_char and self.current_char.isspace():
 63            self.advance()
 64
 65    def read_string(self, quote_char):
 66        """Read a quoted string"""
 67        value = ""
 68        self.advance()  # Skip opening quote
 69
 70        while self.current_char and self.current_char != quote_char:
 71            if self.current_char == "\\":
 72                self.advance()
 73                if self.current_char:
 74                    value += self.current_char
 75                    self.advance()
 76            else:
 77                value += self.current_char
 78                self.advance()
 79
 80        if self.current_char == quote_char:
 81            self.advance()  # Skip closing quote
 82
 83        return value
 84
 85    def read_number(self):
 86        """Read a number (integer or float)"""
 87        value = ""
 88        while self.current_char and (self.current_char.isdigit() or self.current_char == "."):
 89            value += self.current_char
 90            self.advance()
 91        return value
 92
 93    def read_identifier(self):
 94        """Read an identifier (attribute name or operator) - supports URN format"""
 95        value = ""
 96        while self.current_char and (self.current_char.isalnum() or self.current_char in "_-:"):
 97            value += self.current_char
 98            self.advance()
 99            # If the identifier value so far is a schema URN, take that as the identifier and
100            # treat the next part as a sub_attribute
101            if value in self.schema_urns:
102                self.current_char = "."
103                return value
104
105            # Handle dots within URN identifiers (like "2.0")
106            # A dot is part of the identifier if it's followed by a digit
107            if (
108                self.current_char == "."
109                and self.pos + 1 < len(self.text)
110                and self.text[self.pos + 1].isdigit()
111            ):
112                value += self.current_char
113                self.advance()
114                # Continue reading digits after the dot
115                while self.current_char and self.current_char.isdigit():
116                    value += self.current_char
117                    self.advance()
118
119        return value
120
121    def get_next_token(self) -> Token:  # noqa PLR0911
122        """Get the next token from the input"""
123        while self.current_char:
124            if self.current_char.isspace():
125                self.skip_whitespace()
126                continue
127
128            if self.current_char == ".":
129                self.advance()
130                return Token(TokenType.DOT, ".")
131
132            if self.current_char == "[":
133                self.advance()
134                return Token(TokenType.LBRACKET, "[")
135
136            if self.current_char == "]":
137                self.advance()
138                return Token(TokenType.RBRACKET, "]")
139
140            if self.current_char == "(":
141                self.advance()
142                return Token(TokenType.LPAREN, "(")
143
144            if self.current_char == ")":
145                self.advance()
146                return Token(TokenType.RPAREN, ")")
147
148            if self.current_char in "\"'":
149                quote_char = self.current_char
150                value = self.read_string(quote_char)
151                return Token(TokenType.STRING, value)
152
153            if self.current_char.isdigit():
154                value = self.read_number()
155                return Token(TokenType.NUMBER, value)
156
157            if self.current_char.isalpha() or self.current_char == "_":
158                value = self.read_identifier()
159
160                # Check for special keywords
161                if value.lower() == "true":
162                    return Token(TokenType.BOOLEAN, True)
163                elif value.lower() == "false":
164                    return Token(TokenType.BOOLEAN, False)
165                elif value.lower() == "null":
166                    return Token(TokenType.NULL, None)
167                elif value.lower() == "and":
168                    return Token(TokenType.AND, "and")
169                elif value.lower() == "or":
170                    return Token(TokenType.OR, "or")
171                elif value.lower() == "not":
172                    return Token(TokenType.NOT, "not")
173                elif value.lower() in self.OPERATORS:
174                    return Token(TokenType.OPERATOR, value.lower())
175                else:
176                    return Token(TokenType.ATTRIBUTE, value)
177
178            # Skip unknown characters
179            self.advance()
180
181        return Token(TokenType.EOF, "")

Lexer for SCIM paths and filter expressions

SCIMPathLexer(text: str)
44    def __init__(self, text: str):
45        self.schema_urns = [
46            SCIM_URN_SCHEMA,
47            SCIM_URN_GROUP,
48            SCIM_URN_USER,
49            SCIM_URN_USER_ENTERPRISE,
50        ]
51        self.text = text
52        self.pos = 0
53        self.current_char = self.text[self.pos] if self.pos < len(self.text) else None
OPERATORS = ['eq', 'ne', 'co', 'sw', 'ew', 'gt', 'lt', 'ge', 'le', 'pr']
schema_urns
text
pos
current_char
def advance(self):
55    def advance(self):
56        """Move to next character"""
57        self.pos += 1
58        self.current_char = self.text[self.pos] if self.pos < len(self.text) else None

Move to next character

def skip_whitespace(self):
60    def skip_whitespace(self):
61        """Skip whitespace characters"""
62        while self.current_char and self.current_char.isspace():
63            self.advance()

Skip whitespace characters

def read_string(self, quote_char):
65    def read_string(self, quote_char):
66        """Read a quoted string"""
67        value = ""
68        self.advance()  # Skip opening quote
69
70        while self.current_char and self.current_char != quote_char:
71            if self.current_char == "\\":
72                self.advance()
73                if self.current_char:
74                    value += self.current_char
75                    self.advance()
76            else:
77                value += self.current_char
78                self.advance()
79
80        if self.current_char == quote_char:
81            self.advance()  # Skip closing quote
82
83        return value

Read a quoted string

def read_number(self):
85    def read_number(self):
86        """Read a number (integer or float)"""
87        value = ""
88        while self.current_char and (self.current_char.isdigit() or self.current_char == "."):
89            value += self.current_char
90            self.advance()
91        return value

Read a number (integer or float)

def read_identifier(self):
 93    def read_identifier(self):
 94        """Read an identifier (attribute name or operator) - supports URN format"""
 95        value = ""
 96        while self.current_char and (self.current_char.isalnum() or self.current_char in "_-:"):
 97            value += self.current_char
 98            self.advance()
 99            # If the identifier value so far is a schema URN, take that as the identifier and
100            # treat the next part as a sub_attribute
101            if value in self.schema_urns:
102                self.current_char = "."
103                return value
104
105            # Handle dots within URN identifiers (like "2.0")
106            # A dot is part of the identifier if it's followed by a digit
107            if (
108                self.current_char == "."
109                and self.pos + 1 < len(self.text)
110                and self.text[self.pos + 1].isdigit()
111            ):
112                value += self.current_char
113                self.advance()
114                # Continue reading digits after the dot
115                while self.current_char and self.current_char.isdigit():
116                    value += self.current_char
117                    self.advance()
118
119        return value

Read an identifier (attribute name or operator) - supports URN format

def get_next_token(self) -> Token:
121    def get_next_token(self) -> Token:  # noqa PLR0911
122        """Get the next token from the input"""
123        while self.current_char:
124            if self.current_char.isspace():
125                self.skip_whitespace()
126                continue
127
128            if self.current_char == ".":
129                self.advance()
130                return Token(TokenType.DOT, ".")
131
132            if self.current_char == "[":
133                self.advance()
134                return Token(TokenType.LBRACKET, "[")
135
136            if self.current_char == "]":
137                self.advance()
138                return Token(TokenType.RBRACKET, "]")
139
140            if self.current_char == "(":
141                self.advance()
142                return Token(TokenType.LPAREN, "(")
143
144            if self.current_char == ")":
145                self.advance()
146                return Token(TokenType.RPAREN, ")")
147
148            if self.current_char in "\"'":
149                quote_char = self.current_char
150                value = self.read_string(quote_char)
151                return Token(TokenType.STRING, value)
152
153            if self.current_char.isdigit():
154                value = self.read_number()
155                return Token(TokenType.NUMBER, value)
156
157            if self.current_char.isalpha() or self.current_char == "_":
158                value = self.read_identifier()
159
160                # Check for special keywords
161                if value.lower() == "true":
162                    return Token(TokenType.BOOLEAN, True)
163                elif value.lower() == "false":
164                    return Token(TokenType.BOOLEAN, False)
165                elif value.lower() == "null":
166                    return Token(TokenType.NULL, None)
167                elif value.lower() == "and":
168                    return Token(TokenType.AND, "and")
169                elif value.lower() == "or":
170                    return Token(TokenType.OR, "or")
171                elif value.lower() == "not":
172                    return Token(TokenType.NOT, "not")
173                elif value.lower() in self.OPERATORS:
174                    return Token(TokenType.OPERATOR, value.lower())
175                else:
176                    return Token(TokenType.ATTRIBUTE, value)
177
178            # Skip unknown characters
179            self.advance()
180
181        return Token(TokenType.EOF, "")

Get the next token from the input