from ply.lex import TOKEN


# Block comment: `/*`, then everything up to and including the first `*/`.
#
# Inside the loop the character that follows a run of stars must be neither
# `/` (that would close the comment) nor `*` (it belongs to the star run).
# Excluding `*` there keeps the pattern unambiguous: a run of stars can be
# consumed in exactly one way, so an unterminated comment such as `/*****...`
# is rejected in linear time instead of backtracking exponentially.
comment   = r'\/\*[^*]*\*+(?:[^/*][^*]*\*+)*\/'

# Block-comment token.  The comment text is returned unchanged; the lexer's
# line counter is advanced by the number of '\n' characters it contains so
# that later tokens report the right line.
@TOKEN(comment)
def t_COMMENT(t):
    newline_count = t.value.count('\n')
    t.lexer.lineno = t.lexer.lineno + newline_count
    return t

# Single-line comment token: `//` followed by whatever `.` matches.
# The string on the first line of the body is the rule's regular expression,
# not documentation, so it has to stay the first statement of the function.
# NOTE(review): assuming the lexer is built without re.DOTALL, the match stops
# before the line terminator, which is then left for the whitespace rule.
def t_LINE_COMMENT(t):
    r'//.*'
    return t


hex_digit_re            = r'[0-9a-fA-F]'

#
# 7.8.4 String Literals
#
# The fragments below are combined with str.format() into string_literal_re,
# which accepts double-quoted, single-quoted and back-tick delimited strings.
#

line_terminator_sequence_re = r'(\r\n|\n|\r)'
single_escape_character_re  = r'[\'\"\\bfnrtv]'
non_escape_character_re     = r'[^\'\"\\bfnrtv0-9xu\n\r]'

# `\0` (the NUL escape): a zero that is not followed by another decimal digit.
null_escape_re              = r'0(?![0-9])'

hex_escape_sequence_re      = r'x{HexDigit}{HexDigit}'\
                                    .format(HexDigit=hex_digit_re)

unicode_escape_sequence_re  = r'u{HexDigit}{HexDigit}{HexDigit}{HexDigit}'\
                                    .format(HexDigit=hex_digit_re)

character_escape_sequence_re = r'({SingleEscapeCharacter}|{NonEscapeCharacter})'\
                                    .format(SingleEscapeCharacter=single_escape_character_re,
                                            NonEscapeCharacter=non_escape_character_re)

# Everything that may follow the backslash of an escape.  The alternatives
# start with disjoint characters, so at most one of them can apply.
escape_sequence_re          = r'({CharacterEscapeSequence}|{NullEscape}|{HexEscapeSequence}|{UnicodeEscapeSequence})'\
                                    .format(CharacterEscapeSequence=character_escape_sequence_re,
                                            NullEscape=null_escape_re,
                                            HexEscapeSequence=hex_escape_sequence_re,
                                            UnicodeEscapeSequence=unicode_escape_sequence_re)

# A backslash immediately followed by a line terminator.
line_continuation_re        = r'\\{LineTerminatorSequence}'\
                                    .format(LineTerminatorSequence=line_terminator_sequence_re)

# One "unit" of string content.  These patterns are repeated with `*` in
# string_literal_re, so a run of plain characters is forced to be maximal by
# the negative look-ahead: `(x+)*` without it can split a run in exponentially
# many ways, which makes an unterminated string hang the lexer.
double_string_characters_re = r'([^\"\\\n\r]+(?![^\"\\\n\r]))|(\\{EscapeSequence})|{LineContinuation}'\
                                     .format(LineContinuation=line_continuation_re,
                                             EscapeSequence=escape_sequence_re)

single_string_characters_re = r'([^\'\\\n\r]+(?![^\'\\\n\r]))|(\\{EscapeSequence})|{LineContinuation}'\
                                     .format(LineContinuation=line_continuation_re,
                                             EscapeSequence=escape_sequence_re)

# Back-tick strings may span lines.  A backslash always pairs with the
# character after it, so an escaped back-tick (\`) no longer terminates the
# literal; any other backslash pair is accepted as-is, as it was before.
backtick_string_characters_re = r'([^`\\]+(?![^`\\]))|(\\[\s\S])'

string_literal_re           = r'\"({DoubleStringCharacters})*\"|\'({SingleStringCharacters})*\'|`({BackTickStringCharacters})*`'\
                                    .format(SingleStringCharacters=single_string_characters_re,
                                            DoubleStringCharacters=double_string_characters_re,
                                            BackTickStringCharacters=backtick_string_characters_re)

# String-literal token for all three quoting styles accepted by
# string_literal_re.  The type is set explicitly to 'STRING_LITERAL', the name
# listed in literal_keywords.  The value is left as the raw source text,
# delimiters included (the quote-stripping line below is disabled).
@TOKEN(string_literal_re)
def t_STRING_literal(t):
    t.type = 'STRING_LITERAL'
    #t.value = t.value[1:-1] # remove the encasing quotes
    return t

# Token type used for every identifier that is not a reserved word.
# (The tuple keeps its existing spelling because the token list is built
# from it.)
identifer = ('IDENT',)

#
# ECMAScript keywords; they may not be used as Identifiers in ECMAScript
# programs.  Each token name is the upper-case spelling of the keyword.
#
keywords = (
    'BREAK', 'CASE', 'CATCH', 'CONTINUE',
    'DEBUGGER', 'DEFAULT', 'DELETE', 'DO',
    'ELSE', 'FINALLY', 'FOR', 'FUNCTION',
    'IF', 'IN', 'INSTANCEOF', 'TYPEOF',
    'NEW', 'RETURN', 'VAR', 'VOID',
    'SWITCH', 'WHILE', 'THIS', 'WITH',
    'THROW', 'TRY',
)

#
# Words used as keywords in proposed extensions; reserved to allow for the
# possibility of future adoption of those extensions.
#
future_keywords = (
    'CLASS', 'ENUM', 'EXTENDS', 'SUPER',
    'CONST', 'EXPORT', 'IMPORT',
)

#
# Additional FutureReservedWords that only apply within strict mode code.
#
future_strict_keywords = (
    'IMPLEMENTS', 'LET', 'PRIVATE', 'YIELD',
    'PUBLIC', 'INTERFACE', 'PACKAGE', 'PROTECTED',
    'STATIC',
)

#
#   Literals (7.8)
#
literal_keywords = (
    'NULL_LITERAL',     # null
    'BOOLEAN_LITERAL',  # true / false
    'NUMBER_LITERAL',   # hexadecimal and decimal numbers
    'STRING_LITERAL',   # quoted strings
    'REGEX_LITERAL',    # /body/flags
)

"""
7.6 Identifier Names and Identifiers
"""

unicode_letter_re                = r'[a-zA-Z]'  # TODO: extend with all values for unicode letter categories
unicode_digit_re                 = r'[0-9]'     # TODO: extend with all values for unicode digit
# Only the underscore: `-` is an operator, not connector punctuation.  With
# a hyphen in this class `a-b`, `i--` and `x-=1` were each swallowed as one
# identifier.
unicode_connector_punctuation_re = r'[_]'       # TODO: extend with all values for unicode connectors

# `u` followed by exactly four hex digits (the part after the backslash).
unicode_escape_sequence_re       = r'u{HexDigit}{HexDigit}{HexDigit}{HexDigit}' \
                                        .format(HexDigit=hex_digit_re)

# First character of an identifier: a letter, `$`, `_` or a \uXXXX escape.
identifier_start_re              = r'{UnicodeLetter}|[$_]|(\\{UnicodeEscapeSequence})' \
                                        .format(UnicodeLetter=unicode_letter_re,
                                                UnicodeEscapeSequence=unicode_escape_sequence_re)

# Any later character: everything allowed at the start, plus digits and
# connector punctuation.  This is a bare alternation, so users must wrap it
# in a group before applying a quantifier.
identifier_part_re               = r'{IdentifierStart}|{UnicodeDigit}|{UnicodeConnectorPunctuation}' \
                                        .format(IdentifierStart=identifier_start_re,
                                                UnicodeDigit=unicode_digit_re, \
                                                UnicodeConnectorPunctuation=unicode_connector_punctuation_re)

# A complete identifier name: one start character, then any number of parts.
identifier_name                  = r'({IdentifierStart})({IdentifierPart})*' \
                                        .format(IdentifierStart=identifier_start_re,
                                                IdentifierPart=identifier_part_re)

# Identifier-shaped lexemes.  Reserved words get their own token type (the
# upper-case token name from keywords / future_keywords /
# future_strict_keywords), `true` / `false` become BOOLEAN_LITERAL and `null`
# becomes NULL_LITERAL; for anything else the token type is left untouched.
#
# ECMAScript is case-sensitive and its reserved words are all lower case, so
# only an exact lower-case spelling is reclassified: `New`, `IF` or `True`
# are ordinary identifiers.
@TOKEN(identifier_name)
def t_IDENT(t):
    word = t.value

    if word in ('true', 'false'):
        t.type = 'BOOLEAN_LITERAL'
    elif word == 'null':
        t.type = 'NULL_LITERAL'
    elif word == word.lower():
        # The token tables hold upper-case names; the source spelling must be
        # the lower-case form of one of them.
        token_name = word.upper()
        if (token_name in keywords
                or token_name in future_keywords
                or token_name in future_strict_keywords):
            t.type = token_name

    return t

"""
7.8.3 Numeric Literals
"""

# `0x` or `0X` followed by one or more hex digits.  The prefix class is
# `[xX]`; the former `[x|X]` also admitted a literal `|`, so an expression
# such as `0|1` was lexed as a single hexadecimal number.
hex_integer_literal_re  = r'0[xX]{HexDigit}+'.format(HexDigit=hex_digit_re)

# Hexadecimal integer token.  Reported as NUMBER_LITERAL with the source text
# left unchanged in t.value; int(t.value, 16) yields the mathematical value
# if a caller needs it.
@TOKEN(hex_integer_literal_re)
def t_NUMBER_hex_integer_literal(t):
    t.type = 'NUMBER_LITERAL'
    return t

decimal_digit_re            = r'[0-9]'
decimal_digits_re           = r'[0-9]+'
non_zero_digit_re           = r'[1-9]'
exponent_indicator_re       = r'[eE]'
signed_integer_re           = r'[-+]?{DecimalDigits}'.format(DecimalDigits=decimal_digits_re)

# Either a lone `0` or a non-zero digit followed by more digits.  The
# alternation is wrapped in its own group: when it was spliced into
# decimal_literal_re unwrapped, the `0|` split the surrounding group and
# `0.5` was lexed as `0` followed by `.5`.
decimal_integer_literal_re  = r'(?:0|{NonZeroDigit}{DecimalDigit}*)' \
                                    .format(NonZeroDigit=non_zero_digit_re, DecimalDigit=decimal_digit_re)

# `e` / `E`, an optional sign, and at least one digit.
exponent_part_re            = r'({ExponentIndicator}{SignedInteger})' \
                                    .format(ExponentIndicator=exponent_indicator_re, SignedInteger=signed_integer_re)

# Three shapes, tried in this order:
#   integer `.` [digits] [exponent]     e.g. 1.  1.5  0.5e-3
#   `.` digits [exponent]               e.g. .5  .5E+2
#   integer [exponent]                  e.g. 0  42  1e10
decimal_literal_re          = r'({DecimalIntegerLiteral}\.({DecimalDigits})?{ExponentPart}?)|(\.{DecimalDigits}{ExponentPart}?)|({DecimalIntegerLiteral}{ExponentPart}?)' \
                                    .format(DecimalIntegerLiteral=decimal_integer_literal_re,
                                            DecimalDigits=decimal_digits_re,
                                            ExponentPart=exponent_part_re)

# Decimal numeric token (integer, fraction and/or exponent forms accepted by
# decimal_literal_re).  Reported as NUMBER_LITERAL; the value stays the source
# text because the float conversion below is disabled.
@TOKEN(decimal_literal_re)
def t_NUMBER_decimal_literal(t):
    t.type = 'NUMBER_LITERAL'
    #t.value = float(t.value)
    return t

"""
7.8.5 Regular Expression Literals
"""

# A backslash followed by any single character other than CR or LF.
reg_ex_backslash_sequence_re = r'\\[^\n\r]'

# Flags after the closing slash: zero or more identifier-part characters.
reg_ex_flags_re         = r'({IdentifierPart})*'\
                            .format(IdentifierPart=identifier_part_re)

# One character inside a `[...]` class: anything except CR, LF, `]` and `\`,
# or a backslash sequence.
reg_ex_class_char_re    = r'[^\n\r\]\\]|{RegularExpressionBackslashSequence}'\
                            .format(RegularExpressionBackslashSequence=reg_ex_backslash_sequence_re)

# A whole character class, brackets included.
reg_ex_class_re         = r'\[({RegularExpressionClassChar})*\]'\
                            .format(RegularExpressionClassChar=reg_ex_class_char_re)

# One unit of the body: a plain character (not `/`, `\`, `[`, CR or LF), a
# backslash sequence, or a complete class.  Because a class is consumed as a
# unit, a `/` inside `[...]` does not end the literal.
reg_ex_char_re          = r'[^/\\\[\n\r]|{RegularExpressionBackslashSequence}|{RegularExpressionClass}'\
                            .format(RegularExpressionBackslashSequence=reg_ex_backslash_sequence_re,
                                    RegularExpressionClass=reg_ex_class_re)

# Zero or more body units.
reg_ex_chars_re         = r'({RegularExpressionChar})*'\
                            .format(RegularExpressionChar=reg_ex_char_re)

# The first unit of the body additionally excludes `*`, so `/*` never starts
# a regular expression literal.
reg_ex_first_char_re    = r'[^\*/\\\[\n\r]|{RegularExpressionBackslashSequence}|{RegularExpressionClass}'\
                            .format(RegularExpressionClass=reg_ex_class_re,
                                    RegularExpressionBackslashSequence=reg_ex_backslash_sequence_re)

# Body: one first unit followed by any number of further units (never empty).
reg_ex_body_re          = r'({RegularExpressionFirstChar}){RegularExpressionChars}'\
                            .format(RegularExpressionFirstChar=reg_ex_first_char_re,
                                    RegularExpressionChars=reg_ex_chars_re)

# Full literal: `/` body `/` flags.
reg_ex_literal_re       = r'/({RegularExpressionBody})/{RegularExpressionFlags}'\
                            .format(RegularExpressionBody=reg_ex_body_re,
                                    RegularExpressionFlags=reg_ex_flags_re)

# Regular-expression literal token; the value is the raw source text.
# NOTE(review): the pattern does not look at the preceding token, so the
# operators in a division such as `a / b / c` also fit it (` / b /`) --
# confirm how callers tell the two apart.
@TOKEN(reg_ex_literal_re)
def t_REGEX_literal(t):
    t.type = 'REGEX_LITERAL'
    return t

#
#   Punctuators (7.7)
#
# Token names for the punctuators; the comment above each row shows the
# source spellings in the same order as the names.
punctuators = (
    # {  }  (  )  [  ]
    'LBRACE', 'RBRACE', 'LPAREN', 'RPAREN', 'LBRACKET', 'RBRACKET',
    # .  ;  ,
    'DOT', 'SEMI', 'COMMA',
    # +  -  *  %
    'PLUS', 'MINUS', 'MULTIPLY', 'MODULO',
    # ++  --
    'PLUSPLUS', 'MINUSMINUS',
    # =  +=  -=  *=  %=
    'EQUAL', 'PLUSEQUAL', 'MINUSEQUAL', 'MULTIPLYEQUAL', 'MODULOEQUAL',
    # <<=  >>=  >>>=
    'LSHIFTEQUAL', 'RSHIFTEQUAL', 'LOGRSHIFTEQUAL',
    # &=  |=  ^=
    'ANDEQUAL', 'OREQUAL', 'XOREQUAL',
    # <<  >>  >>>
    'LSHIFT', 'RSHIFT', 'LOGRSHIFT',
    # &  |  ^  ~
    'AND', 'OR', 'XOR', 'BWNOT',
    # <  >  <=  >=
    'LT', 'GT', 'LTE', 'GTE',
    # ==  !=   (value, with type coercion)
    'EQUALV', 'NOTEQUALV',
    # ===  !== (value and type)
    'EQUALVT', 'NOTEQUALVT',
    # ?  :
    'TERNARY', 'COLON',
    # &&  !  ||
    'ANDAND', 'NOT', 'OROR',
)

# The two punctuators that begin with a slash:  /  and  /=
div_punctuators = ('FSLASH', 'FSLASHEQUAL')

# Whitespace and comment token names.
aux = ('SPACE', 'COMMENT', 'LINE_COMMENT')

# The complete tuple of token names: every group defined above, flattened in
# this fixed order.
tokens = tuple(
    token_name
    for token_group in (
        identifer,
        keywords,
        future_keywords,
        future_strict_keywords,
        punctuators,
        div_punctuators,
        literal_keywords,
        aux,
    )
    for token_name in token_group
)

# Punctuator rules, one regular expression string per token, grouped by
# operator family.  No two of these expressions that can match at the same
# position have the same length, and PLY tries string rules longest
# expression first, so their order in this file does not affect matching.

# brackets and separators
t_LBRACE         = r'{'
t_RBRACE         = r'}'
t_LPAREN         = r'\('
t_RPAREN         = r'\)'
t_LBRACKET       = r'\['
t_RBRACKET       = r'\]'
t_DOT            = r'\.'
t_SEMI           = r';'
t_COMMA          = r','
t_TERNARY        = r'\?'
t_COLON          = r':'

# arithmetic
t_PLUS           = r'\+'
t_PLUSPLUS       = r'\+\+'
t_PLUSEQUAL      = r'\+='
t_MINUS          = r'-'
t_MINUSMINUS     = r'--'
t_MINUSEQUAL     = r'-='
t_MULTIPLY       = r'\*'
t_MULTIPLYEQUAL  = r'\*='
t_MODULO         = r'%'
t_MODULOEQUAL    = r'%='

# assignment and equality
t_EQUAL          = r'='
t_EQUALV         = r'=='
t_EQUALVT        = r'==='
t_NOT            = r'!'
t_NOTEQUALV      = r'!='
t_NOTEQUALVT     = r'!=='

# relational and shifts
t_LT             = r'<'
t_LTE            = r'<='
t_LSHIFT         = r'<<'
t_LSHIFTEQUAL    = r'<<='
t_GT             = r'>'
t_GTE            = r'>='
t_RSHIFT         = r'>>'
t_RSHIFTEQUAL    = r'>>='
t_LOGRSHIFT      = r'>>>'
t_LOGRSHIFTEQUAL = r'>>>='

# bitwise and logical
t_AND            = r'&'
t_ANDAND         = r'&&'
t_ANDEQUAL       = r'&='
t_OR             = r'\|'
t_OROR           = r'\|\|'
t_OREQUAL        = r'\|='
t_XOR            = r'\^'
t_XOREQUAL       = r'\^='
t_BWNOT          = r'~'

# div punctuators
t_FSLASH         = r'/'
t_FSLASHEQUAL    = r'/='

# Whitespace token: a run of spaces, tabs, carriage returns and newlines,
# returned as a token rather than discarded.  The first statement of the
# body is the rule's regular expression, not documentation.  Each '\n' in
# the run advances the lexer's line counter.
def t_SPACE(t):
    r'[ \t\r\n]+'
    newline_count = t.value.count('\n')
    t.lexer.lineno = t.lexer.lineno + newline_count
    return t


def calc_column(input, lexpos):
    """Return the 1-based column of offset *lexpos* within *input*.

    The column is counted from the character after the last '\\n' found
    before *lexpos*, or from the start of *input* when there is none.
    """
    # rfind() gives -1 when no newline precedes lexpos, which makes the
    # subtraction below yield lexpos + 1, i.e. a 1-based column on line one.
    last_newline = input.rfind('\n', 0, lexpos)
    return lexpos - last_newline

# Error rule.  Reports the offending character together with its line and
# column, switches the lexer back to the INITIAL state, and skips exactly one
# character.  The token is returned with its value cut down to that single
# character.
def t_ANY_error(t):
    bad_char = t.value[0]
    column = calc_column(t.lexer.lexdata, t.lexpos)
    message = 'htmllex: Illegal character {0} at {1}:{2}'.format(repr(bad_char), t.lineno, column)
    print(message)

    lexer = t.lexer
    lexer.begin('INITIAL')
    t.value = bad_char
    lexer.skip(1)
    return t
