import ply.lex as lex
from ply.lex import TOKEN

# Names of every token type this lexer can emit (read by PLY).
tokens = (
    # top-level markup
    'COMMENT', 'CONDITIONAL_COMMENT', 'XML', 'CDATA', 'DTD', 'SCRIPTLET',
    'SPACE', 'SCRIPT_OPEN', 'STYLE_OPEN', 'HTML_TEXT',

    # <script> contents
    'SCRIPT_BODY', 'SCRIPT_SHORT_BODY',

    # <style> contents
    'STYLE_BODY', 'STYLE_SHORT_BODY',

    # tag delimiters and names
    'TAG_OPEN',
    'TAG_CLOSE', 'TAG_SLASH_CLOSE', 'TAG_SLASH', 'TAG_EQUALS', 'TAG_NAME',

    # attribute values
    'ATTRIBUTE',
)

# Additional lexer states; every one of them is declared 'exclusive'.
states = tuple(
    (state_name, 'exclusive')
    for state_name in ('tag', 'attvalue', 'script', 'style')
)


# Tokens to ignore: none. Whitespace is returned as SPACE tokens by the rules
# below, so the usual whitespace shortcut is left disabled:
#t_ignore = ' '

def t_COMMENT(t):
    r'<!--(.|\n)*?-->'
    # The string above is the token pattern (not documentation): an HTML
    # comment, matched non-greedily up to the first "-->".
    newline_count = t.value.count('\n')
    # Keep the lexer's line counter in step with the lines just consumed.
    t.lexer.lineno += newline_count
    return t

def t_CONDITIONAL_COMMENT(t):
    r'<!\[(?!CDATA\[)(.|\n)*?\]>'
    # The string above is the token pattern (not documentation).
    #
    # Matches "<![ ... ]>" blocks such as "<![if IE]>" / "<![endif]>".
    # The brackets must be escaped: unescaped, "[...]" is a character class
    # and the rule would only match "<!" + one punctuation char + ">".
    # The negative lookahead keeps "<![CDATA[ ... ]]>" out of this rule so a
    # CDATA section is never reported as a conditional comment.
    t.lexer.lineno += t.value.count('\n')  # track lines spanned by the match
    return t

def t_XML(t):
    r'<\?xml(.|\n)*?>'
    # The string above is the token pattern (not documentation).
    #
    # Matches an XML declaration, "<?xml ... >".  The "?" must be escaped:
    # unescaped, "<?" means "an optional '<'", which made this rule match
    # ordinary text beginning with "xml" and never match "<?xml".
    t.lexer.lineno += t.value.count('\n')  # track lines spanned by the match
    return t

def t_CDATA(t):
    r'<!\[CDATA\[(.|\n)*?\]\]>'
    # The string above is the token pattern (not documentation).
    #
    # Matches a CDATA section, "<![CDATA[ ... ]]>", non-greedily up to the
    # first "]]>".  Every literal bracket is escaped: unescaped, "[CDATA[...]"
    # is a character class and real CDATA sections never matched.
    t.lexer.lineno += t.value.count('\n')  # track lines spanned by the match
    return t

def t_DTD(t):
    r'<!(.|\n)*?>'
    # The string above is the token pattern (not documentation):
    # "<!" followed by anything, up to the first ">".
    lexer = t.lexer
    lexer.lineno = lexer.lineno + t.value.count('\n')
    return t

def t_SCRIPTLET(t):
    r'(<\?(.|\n)*?\?>|<%(.|\n)*?%>)'
    # The string above is the token pattern (not documentation):
    # either "<? ... ?>" or "<% ... %>".
    lines_consumed = t.value.count('\n')
    t.lexer.lineno += lines_consumed
    return t

def t_SPACE(t):
    r'[ \t\r\n]+'
    # The string above is the token pattern (not documentation): a run of
    # spaces, tabs, carriage returns and newlines, returned as a token.
    text = t.value
    t.lexer.lineno += text.count('\n')
    return t

def t_SCRIPT_OPEN(t):
    r'<script(.|\n)*?>'
    # The string above is the token pattern (not documentation): the whole
    # opening "<script ...>" tag, attributes included.
    lexer = t.lexer
    lexer.lineno += t.value.count('\n')
    # What follows is lexed by the rules of the 'script' state.
    lexer.begin('script')
    return t

def t_STYLE_OPEN(t):
    r'<style(.|\n)*?>'
    # The string above is the token pattern (not documentation): the whole
    # opening "<style ...>" tag, attributes included.
    lexer = t.lexer
    lexer.lineno += t.value.count('\n')
    # What follows is lexed by the rules of the 'style' state.
    lexer.begin('style')
    return t

def t_TAG_OPEN(t):
    r'<'
    # The string above is the token pattern (not documentation).
    # A bare '<' starts a tag: switch to the 'tag' state for its contents.
    lexer = t.lexer
    lexer.begin('tag')
    return t

def t_HTML_TEXT(t):
    r'[^<]+'
    # The string above is the token pattern (not documentation): a run of
    # characters up to (not including) the next '<'.
    #
    # The pattern also consumes newlines, so the line counter has to be
    # advanced here as in the other multi-line rules; otherwise every line
    # number reported after multi-line text is too small.
    t.lexer.lineno += t.value.count('\n')
    return t

# script state

def t_script_SCRIPT_BODY(t):
    r'(.|\n)*?</script>'
    # The string above is the token pattern (not documentation): everything
    # up to and including the first "</script>".
    lexer = t.lexer
    lexer.lineno += t.value.count('\n')
    # Script contents are finished; resume with the INITIAL rules.
    lexer.begin('INITIAL')
    return t

def t_script_SCRIPT_SHORT_BODY(t):
    r'(.|\n)*?</>'
    # The string above is the token pattern (not documentation): everything
    # up to and including the first short close tag "</>".
    lexer = t.lexer
    lexer.lineno += t.value.count('\n')
    # Script contents are finished; resume with the INITIAL rules.
    lexer.begin('INITIAL')
    return t

# style state

def t_style_STYLE_BODY(t):
    r'(.|\n)*?</style>'
    # The string above is the token pattern (not documentation): everything
    # up to and including the first "</style>".
    lexer = t.lexer
    lexer.lineno += t.value.count('\n')
    # Style contents are finished; resume with the INITIAL rules.
    lexer.begin('INITIAL')
    return t

def t_style_STYLE_SHORT_BODY(t):
    r'(.|\n)*?</>'
    # The string above is the token pattern (not documentation): everything
    # up to and including the first short close tag "</>".
    lexer = t.lexer
    lexer.lineno += t.value.count('\n')
    # Style contents are finished; resume with the INITIAL rules.
    lexer.begin('INITIAL')
    return t

# tag state

# Regex fragments for names inside a tag (Python ``re`` syntax).
HEXDIGIT = r'[a-fA-F0-9]'
DIGIT = r'[0-9]'
# First character of a name: ':' or an ASCII letter, or a character from one
# of the listed Unicode ranges.  Ranges must be written as character-class
# ranges ("\uXXXX-\uYYYY"); in ``re`` the notation "\uXXXX..\uYYYY" is NOT a
# range -- each '.' matches an arbitrary character.
TAG_NAME_START_CHAR = (
    r'[:a-zA-Z'
    r'\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]'
)
# Any later character: a start character, '-', '_', '.', a digit, U+00B7, or
# a character in U+0300..U+036F / U+203F..U+2040.
TAG_NAME_CHAR = (
    r'(?:' + TAG_NAME_START_CHAR
    + r'|[-_.\u00B7\u0300-\u036F\u203F-\u2040]'
    + r'|' + DIGIT
    + r')'
)
# A full name: one start character followed by any number of name characters.
TAG_NAME = TAG_NAME_START_CHAR + TAG_NAME_CHAR + r'*'

def t_tag_TAG_CLOSE(t):
    r'>'
    # The string above is the token pattern (not documentation).
    # '>' ends the tag: go back to the INITIAL state.
    lexer = t.lexer
    lexer.begin('INITIAL')
    return t

def t_tag_TAG_SLASH_CLOSE(t):
    r'/>'
    # The string above is the token pattern (not documentation).
    # '/>' ends a self-closing tag: go back to the INITIAL state.
    lexer = t.lexer
    lexer.begin('INITIAL')
    return t

def t_tag_TAG_SLASH(t):
    r'/'
    # The string above is the token pattern (not documentation).
    # A '/' inside a tag is returned as-is; the lexer state is not changed.
    return t

def t_tag_TAG_EQUALS(t):
    r'='
    # The string above is the token pattern (not documentation).
    # After '=' an attribute value is expected: lex it in the 'attvalue' state.
    lexer = t.lexer
    lexer.begin('attvalue')
    return t

# Rule for names inside a tag.  The pattern comes from the TAG_NAME regex,
# attached with the TOKEN decorator instead of a docstring.
@TOKEN(TAG_NAME)
def t_tag_TAG_NAME(t):
    # The matched name is returned unchanged; the lexer state is not changed.
    return t

def t_tag_SPACE(t):
    r'[ \t\r\n]+'
    # The string above is the token pattern (not documentation): whitespace
    # inside a tag, returned as a SPACE token.
    text = t.value
    t.lexer.lineno += text.count('\n')
    return t

# attribute value state

# Regex fragments for attribute values (Python ``re`` syntax).
# NOTE(review): the space and '#' are backslash-escaped, presumably because
# PLY compiles rule patterns with re.VERBOSE -- confirm against the lex() call.
ATTCHAR = r'(-|_|\.|/|\+|,|\?|=|:|;|\#|[0-9a-zA-Z])'
# One or more unquoted value characters, plus at most one trailing space.
ATTCHARS = ATTCHAR + r'+\ ?'
HEXCHARS = r'\#[0-9a-fA-F]+'
DECCHARS = r'[0-9]+%?'
DOUBLE_QUOTE_STRING = r'"[^<"]*"'
SINGLE_QUOTE_STRING = r"'[^<']*'"
# ``re`` alternation takes the FIRST branch that matches, not the longest.
# ATTCHARS matches any run of digits, so with DECCHARS listed after it the
# optional '%' was unreachable: "50%" lexed as "50" and left '%' behind as an
# illegal character.  The explicit percentage branch ahead of ATTCHARS fixes
# that; values such as "50px" still fall through to ATTCHARS because the
# percentage branch requires the '%'.
ATTRIBUTE = (
    r'(' + DOUBLE_QUOTE_STRING
    + r'|' + SINGLE_QUOTE_STRING
    + r'|[0-9]+%'
    + r'|' + ATTCHARS
    + r'|' + HEXCHARS
    + r'|' + DECCHARS
    + r')'
)

# Rule for an attribute value; its pattern is the ATTRIBUTE regex, attached
# with the TOKEN decorator instead of a docstring.
@TOKEN(ATTRIBUTE)
def t_attvalue_ATTRIBUTE(t):
    # The value has been read: continue with the rest of the tag.
    lexer = t.lexer
    lexer.begin('tag')
    return t

def t_attvalue_SPACE(t):
    r'[ \t]+'
    # The string above is the token pattern (not documentation): spaces and
    # tabs only.  Newlines are not matched here, so the line counter needs no
    # update, and the lexer state is not changed.
    return t


# restore after any errors by issuing an "error" token

def calc_column(input, lexpos):
    """Return the 1-based column of offset *lexpos* within *input*.

    The column is counted from the character after the last newline that
    precedes *lexpos* (or from the start of *input* if there is none).
    """
    # rfind gives the index of that newline, or -1 when there is none, so the
    # distance from it to lexpos is already the 1-based column.
    previous_newline = input.rfind('\n', 0, lexpos)
    return lexpos - previous_newline

def t_ANY_error(t):
    # Error rule (t_ANY_ prefix): report the offending character with its
    # line and column, switch back to the INITIAL state, shrink the token to
    # that single character, skip past it and return the token.
    offending = t.value[0]
    column = calc_column(t.lexer.lexdata, t.lexpos)
    message = 'htmllex: Illegal character {0} at {1}:{2}'.format(
        repr(offending), t.lineno, column)
    print(message)

    lexer = t.lexer
    lexer.begin('INITIAL')
    t.value = offending
    lexer.skip(1)
    return t

