I created a syntax highlighter to make the code on my blog colorful. It is written in Python, so the page needs no JavaScript when it loads. This is useful when you generate HTML pages for documentation or a blog. Instead of packaging it, I am sharing the code as-is, so it will be easier to modify when you find something missing.
reserved = [ "if", "elif", "else", "while", "for", "def", "return", "global", "import", "from", "continue", "break", "del", "in", "not", "or", "and", "is", "raise", "class", ] whitespace = [" ", "\n"] quotes = ["'", '"'] nums = "0123456789" abc = "abcdefghijklmnoprstuvyzwxq" abc += abc.upper() + "_" operators = ["=", "<", ">", "+", "-", "=", "/", "*", "?", "<=", ">=", "==", "!", "!=", "%"] named_tokens = { ",": "comma", ".": "dot", "[": "bracked_begin", "]": "bracket_end", "(": "paran_begin", ")": "paran_end", "{": "curly_begin", "}": "curly_end", ";": "semicolon", ":": "colon", } quote_escape = chr(92) reserved_identifiers = ["None", "True", "False"] def seek_new_token(): i = tokenizer["i"] if tokenizer["char"] in quotes: tokenizer["current_quote"] = tokenizer["char"] tokenizer["token"] = {"type": "string", "starts": i, "ends": i+1} tokenizer["tokens"].append(tokenizer["token"]) elif tokenizer["char"] in operators: tokenizer["token"] = {"type": "operator", "starts": i, "ends": i+1} tokenizer["tokens"].append(tokenizer["token"]) elif tokenizer["char"] in nums: tokenizer["token"] = {"type": "number", "starts": i, "ends": i+1} tokenizer["tokens"].append(tokenizer["token"]) elif tokenizer["char"] in abc: tokenizer["token"] = {"type": "identifier", "starts": i, "ends": i+1} tokenizer["tokens"].append(tokenizer["token"]) elif tokenizer["char"] in named_tokens: tokenizer["tokens"].append({"type": named_tokens[tokenizer["char"]], "starts": i, "ends": i+1}) elif tokenizer["char"] == "#": tokenizer["token"] = {"type": "comment", "starts": i, "ends": i+1} tokenizer["tokens"].append(tokenizer["token"]) elif tokenizer["char"] in whitespace: tokenizer["token"] = {"type": "whitespace", "starts": i, "ends": i+1} tokenizer["tokens"].append(tokenizer["token"]) tokenizer = { "i": 0, "current_quote": None, "token": None, "tokens": [], "char": None } def reset(): tokenizer["i"] = 0 tokenizer["current_quote"] = None tokenizer["token"] = None tokenizer["tokens"] = [] tokenizer["char"] = None def tokenize(text): reset() while tokenizer["i"] < len(text): tokenizer["char"] = text[tokenizer["i"]] if tokenizer["token"]: token_next = text[tokenizer["token"]["starts"]: tokenizer["i"]+1] if tokenizer["token"]["type"] == "string": tokenizer["token"]["ends"] += 1 if tokenizer["char"] == tokenizer["current_quote"] and text[tokenizer["i"]-1] != quote_escape: tokenizer["token"] = None tokenizer["current_quote"] = None elif tokenizer["token"]["type"] == "operator": if token_next in operators: tokenizer["token"]["ends"] += 1 else: tokenizer["token"] = None seek_new_token() elif tokenizer["token"]["type"] == "number": if tokenizer["char"] in nums: tokenizer["token"]["ends"] += 1 else: tokenizer["token"] = None seek_new_token() elif tokenizer["token"]["type"] in ("identifier", "reserved", "reserved-identifier"): if tokenizer["char"] in abc or tokenizer["char"] in nums: tokenizer["token"]["ends"] += 1 if token_next in reserved_identifiers: tokenizer["token"]["type"] = "reserved-identifier" elif token_next in reserved: tokenizer["token"]["type"] = "reserved" else: tokenizer["token"]["type"] = "identifier" else: tokenizer["token"] = None seek_new_token() elif tokenizer["token"]["type"] == "comment": if tokenizer["char"] == "\n": tokenizer["token"] = None seek_new_token() else: tokenizer["token"]["ends"] += 1 elif tokenizer["token"]["type"] == "whitespace": if tokenizer["char"] in whitespace: tokenizer["token"]["ends"] += 1 else: tokenizer["token"] = None seek_new_token() else: seek_new_token() tokenizer["i"] += 1 return 
tokenizer["tokens"]
Every token records the range of character indices it covers, so given the index of a character you can look up which token, and therefore which category, it belongs to with a small function like the one below.
```python
def seek_token_by_char_index(tokens, char_index):
    # Return the token whose [starts, ends) range contains char_index.
    for token in tokens:
        if token["starts"] <= char_index < token["ends"]:
            return token
```
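As a quick usage example, looking up a couple of indices in a short snippet returns the token covering each one:

```python
tokens = tokenize("total = 41 + 1")
print(seek_token_by_char_index(tokens, 0))  # {'type': 'identifier', 'starts': 0, 'ends': 5}
print(seek_token_by_char_index(tokens, 8))  # {'type': 'number', 'starts': 8, 'ends': 10}
```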
Good luck playing with it.