/* HTMLLexer1.java is a generated file. You probably want to
* edit HTMLLexer1.lex to make changes. Use JFlex to generate it.
* To generate HTMLLexer1.java
* Install JFlex v1.3.2 or later.
* Once JFlex is in your classpath run
* java JFlex.Main HTMLLexer1.lex
* You will then have a file called HTMLLexer1.java
*/
/*
* This file is part of a syntax
* highlighting package.
* Copyright (C) 1999-2002 Stephen Ostermiller
* http://ostermiller.org/contact.pl?regarding=Syntax+Highlighting
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* See COPYING.TXT for details.
*/
package com.Ostermiller.Syntax.Lexer;
import java.io.*;
/**
* HTMLLexer1 is an HTML 2.0 lexer. Created with JFlex. An example of how it is used:
* <pre>
* HTMLLexer1 shredder = new HTMLLexer1(System.in);
* Token t;
* while ((t = shredder.getNextToken()) != null){
*     System.out.println(t);
* }
* </pre>
* <p>
* There are two HTML Lexers that come with this package. HTMLLexer is a basic HTML lexer
* that knows the difference between tags, text, and comments. HTMLLexer1 knows something
* about the structure of tags and can return names and values from name value pairs. It
* also knows about text elements such as words and character references. The two are
* similar but which you should use depends on your purpose. In my opinion the HTMLLexer1
* is much better for syntax highlighting.
*
* @see HTMLLexer
* @see HTMLToken1
*/
%%
/* Options describing the generated scanner class: it is public, named
 * HTMLLexer1, implements the Lexer interface, and its scanning method is
 * called getNextToken and returns a Token. */
%public
%class HTMLLexer1
%implements Lexer
%function getNextToken
%type Token
%{
// Scanner bookkeeping fields. The code that reads and writes them is not in
// this part of the file; presumably the rule actions maintain them -- TODO confirm.
private int lastToken;
// Initialised to the initial lexical state, YYINITIAL.
private int nextState=YYINITIAL;
/**
 * Fetches the next token, optionally skipping white space and/or comments.
 * Tokens are pulled from the no-argument getNextToken() until one is found
 * that should be handed back, or until the input is exhausted.
 *
 * @param returnComments if false, comment tokens are skipped.
 * @param returnWhiteSpace if false, white space tokens are skipped.
 * @return the next token that was not filtered out, or null when there are no more tokens.
 * @throws IOException if the underlying scanner fails to read its input.
 */
public Token getNextToken(boolean returnComments, boolean returnWhiteSpace)throws IOException{
    for (;;) {
        Token candidate = getNextToken();
        if (candidate == null) {
            // End of input: nothing left to filter.
            return null;
        }
        if (!returnWhiteSpace && candidate.isWhiteSpace()) {
            continue;
        }
        if (!returnComments && candidate.isComment()) {
            continue;
        }
        return candidate;
    }
}
/**
 * Prints out tokens from a file or System.in.
 * If no arguments are given, System.in will be used for input.
 * If one or more arguments are given, the first argument will be used as
 * the name of the file to use as input; any further arguments are ignored.
 * White space tokens are not printed. If the input cannot be found, opened
 * or read, the exception message is printed instead.
 *
 * @param args program arguments, of which the first is a filename
 */
public static void main(String[] args) {
    // Non-null only when this method opened a file itself, so that
    // System.in is never closed here.
    InputStream opened = null;
    try {
        InputStream in;
        if (args.length > 0){
            File f = new File(args[0]);
            if (!f.exists()){
                throw new IOException("Could not find " + args[0]);
            }
            if (!f.canRead()){
                throw new IOException("Could not open " + args[0]);
            }
            opened = new FileInputStream(f);
            in = opened;
        } else {
            in = System.in;
        }
        HTMLLexer1 shredder = new HTMLLexer1(in);
        Token t;
        while ((t = shredder.getNextToken()) != null) {
            // Ask the token itself whether it is white space instead of
            // comparing its ID against a constant of the C lexer's token class.
            if (!t.isWhiteSpace()){
                System.out.println(t);
            }
        }
    } catch (IOException e){
        System.out.println(e.getMessage());
    } finally {
        if (opened != null){
            try {
                opened.close();
            } catch (IOException ignored){
                // Best effort: all tokens have already been printed.
            }
        }
    }
}
/**
 * Points this lexer at a new input and sets the position it reports from.
 * The switch itself is delegated to yyreset(reader): the current input stream
 * is closed, all internal variables are reset, the content of the internal
 * buffer is discarded (so the old input cannot be reused), and the lexical
 * state goes back to the initial state. Afterwards the line, character and
 * column counters are set to the values given here, so that subsequent tokens
 * are numbered starting from them.
 *
 * @param reader The new input.
 * @param yyline The line number of the first token.
 * @param yychar The position (relative to the start of the stream) of the first token.
 * @param yycolumn The position (relative to the line) of the first token.
 * @throws IOException if an IOException occurs while switching readers.
 */
public void reset(java.io.Reader reader, int yyline, int yychar, int yycolumn) throws IOException{
    // Must come first: the counters are assigned only after the scanner has been reset.
    yyreset(reader);
    this.yycolumn = yycolumn;
    this.yychar = yychar;
    this.yyline = yyline;
}
%}
/* Turn on line, character and column counting (the yyline, yychar and
 * yycolumn fields that reset() assigns). */
%line
%char
%column
/* Character set option for the generated scanner. */
%full
/* Lexical states. The rules that enter and leave them follow the second
 * separator and are not part of this section. */
%state START_TAG
%state START_END_TAG
%state START_DOC_TAG
%state TAG
%state START_EQUAL
%state START_VALUE
%state SCRIPT_TAG
%state START_SCRIPT_EQUAL
%state START_SCRIPT_VALUE
%state SCRIPT
%state PRE_TAG
%state START_PRE_EQUAL
%state START_PRE_VALUE
%state PRE
%state TEXTAREA_TAG
%state START_TEXTAREA_EQUAL
%state START_TEXTAREA_VALUE
%state TEXTAREA
%state TAG_END
%state DOCTYPE
%state COMMENT_DEF
%state FINISH_END_TAG
/* Basic character classes: ASCII digits, ASCII letters and hexadecimal digits. */
Digit=([0-9])
Letter=([a-zA-Z])
HexDigit=({Digit}|[a-fA-F])
/* Individual white space characters. */
BLANK=([ ])
TAB=([\t])
FF=([\f])
CR=([\r])
LF=([\n])
/* A line terminator: CR, LF, or CR immediately followed by LF. */
EOL=({CR}|{LF}|{CR}{LF})
/* One white space unit: blank, tab, form feed or line terminator. */
WhiteSpace=({BLANK}|{TAB}|{FF}|{EOL})
/* Any single character. */
AnyChar=([^])
/* Numeric character references without the leading ampersand:
 * decimal (#123) and hexadecimal (#x7B or #X7B). */
DecCharRef=("#"{Digit}+)
HexCharRef=("#"[Xx]{HexDigit}+)
/* Named character entities, split over several macros to keep the lines
 * manageable. Names are case sensitive. ECR11 includes "alefsym" so that
 * the list covers every named entity of HTML 4.01. */
ECR1=("nbsp"|"iexcl"|"cent"|"pound"|"curren"|"yen"|"brvbar"|"sect"|"uml"|"copy"|"ordf"|"laquo"|"not"|"shy"|"reg"|"macr"|"deg")
ECR2=("plusmn"|"sup2"|"sup3"|"acute"|"micro"|"para"|"middot"|"cedil"|"sup1"|"ordm"|"raquo"|"frac14"|"frac12"|"frac34"|"iquest")
ECR3=("Agrave"|"Aacute"|"Acirc"|"Atilde"|"Auml"|"Aring"|"AElig"|"Ccedil"|"Egrave"|"Eacute"|"Ecirc"|"Euml"|"Igrave"|"Iacute")
ECR4=("Icirc"|"Iuml"|"ETH"|"Ntilde"|"Ograve"|"Oacute"|"Ocirc"|"Otilde"|"Ouml"|"times"|"Oslash"|"Ugrave"|"Uacute"|"Ucirc")
ECR5=("Uuml"|"Yacute"|"THORN"|"szlig"|"agrave"|"aacute"|"acirc"|"atilde"|"auml"|"aring"|"aelig"|"ccedil"|"egrave"|"eacute")
ECR6=("ecirc"|"euml"|"igrave"|"iacute"|"icirc"|"iuml"|"eth"|"ntilde"|"ograve"|"oacute"|"ocirc"|"otilde"|"ouml"|"divide")
ECR7=("oslash"|"ugrave"|"uacute"|"ucirc"|"uuml"|"yacute"|"thorn"|"yuml"|"fnof"|"Alpha"|"Beta"|"Gamma"|"Delta"|"Epsilon")
ECR8=("Zeta"|"Eta"|"Theta"|"Iota"|"Kappa"|"Lambda"|"Mu"|"Nu"|"Xi"|"Omicron"|"Pi"|"Rho"|"Sigma"|"Tau"|"Upsilon"|"Phi"|"Chi")
ECR9=("Psi"|"Omega"|"alpha"|"beta"|"gamma"|"delta"|"epsilon"|"zeta"|"eta"|"theta"|"iota"|"kappa"|"lambda"|"mu"|"nu"|"xi")
ECR10=("omicron"|"pi"|"rho"|"sigmaf"|"sigma"|"tau"|"upsilon"|"phi"|"chi"|"psi"|"omega"|"thetasym"|"upsih"|"piv"|"bull")
ECR11=("hellip"|"prime"|"Prime"|"oline"|"frasl"|"weierp"|"image"|"real"|"trade"|"alefsym"|"larr"|"uarr"|"rarr"|"darr"|"harr"|"crarr")
ECR12=("lArr"|"uArr"|"rArr"|"dArr"|"hArr"|"forall"|"part"|"exist"|"empty"|"nabla"|"isin"|"notin"|"ni"|"prod"|"sum"|"minus")
ECR13=("lowast"|"radic"|"prop"|"infin"|"ang"|"and"|"or"|"cap"|"cup"|"int"|"there4"|"sim"|"cong"|"asymp"|"ne"|"equiv"|"le")
ECR14=("ge"|"sub"|"sup"|"nsub"|"sube"|"supe"|"oplus"|"otimes"|"perp"|"sdot"|"lceil"|"rceil"|"lfloor"|"rfloor"|"lang"|"rang")
ECR15=("loz"|"spades"|"clubs"|"hearts"|"diams"|"quot"|"amp"|"lt"|"gt"|"OElig"|"oelig"|"Scaron"|"scaron"|"Yuml"|"circ"|"tilde")
ECR16=("ensp"|"emsp"|"thinsp"|"zwnj"|"zwj"|"lrm"|"rlm"|"ndash"|"mdash"|"lsquo"|"rsquo"|"sbquo"|"ldquo"|"rdquo"|"bdquo"|"dagger")
ECR17=("Dagger"|"permil"|"lsaquo"|"rsaquo"|"euro")
/* Any one of the named entities above. */
EntityCharRef=({ECR1}|{ECR2}|{ECR3}|{ECR4}|{ECR5}|{ECR6}|{ECR7}|{ECR8}|{ECR9}|{ECR10}|{ECR11}|{ECR12}|{ECR13}|{ECR14}|{ECR15}|{ECR16}|{ECR17})
/* A complete character reference: ampersand, then a named, decimal or
 * hexadecimal reference, then an optional terminating semicolon. */
CharacterReference=("&"({EntityCharRef}|{DecCharRef}|{HexCharRef})";"?)
/* An ampersand followed by any run of characters other than blank, CR, LF,
 * less-than, semicolon and ampersand. */
FalseCharRef=("&"[^\ \r\n\<\;\&]*)
/* A string enclosed in double quotes or in single quotes; the enclosed text
 * may contain anything except the quote character that delimits it. */
StringLiteral=(([\"][^\"]*[\"])|([\'][^\']*[\']))
/* One or more letters, digits, periods or hyphens. */
NameToken=(({Letter}|{Digit}|[\.\-])+)
/* A value is either a bare name token or a quoted string literal. */
Value=({NameToken}|{StringLiteral})
/* Markup delimiters: opener of a start tag, opener of an end tag, closer of
 * any tag, and opener of a markup declaration (doctype or comment). */
TagStart=("<")
EndTagStart=("</")
TagEnd=(">")
DocTagStart=("<!")
/* The word "doctype" in any mix of upper and lower case. */
Doctype=([Dd][Oo][Cc][Tt][Yy][Pp][Ee])
/* Zero or more units, each either a quoted string literal or a single
 * character that is not a greater-than sign or a quote. */
DoctypeText=(([^\>\"\']|{StringLiteral})*)
/* The word "script" in any mix of upper and lower case. */
NameScript=([Ss][Cc][Rr][Ii][Pp][Tt])
/* A less-than sign, or a less-than sign and slash followed by optional white
 * space and any prefix of "script" (the full word may be followed by white
 * space). In other words, an end-of-script tag that has been started but
 * not closed by a greater-than sign. */
FalseEndScript=([\<]|[\<][\/]{WhiteSpace}*|[\<][\/]{WhiteSpace}*[Ss]|[\<][\/]{WhiteSpace}*[Ss][Cc]|[\<][\/]{WhiteSpace}*[Ss][Cc][Rr]|[\<][\/]{WhiteSpace}*[Ss][Cc][Rr][Ii]|[\<][\/]{WhiteSpace}*[Ss][Cc][Rr][Ii][Pp]|[\<][\/]{WhiteSpace}*[Ss][Cc][Rr][Ii][Pp][Tt]{WhiteSpace}*)
/* Either a comment declaration, or a (possibly empty) run built from
 * characters other than less-than and from less-than-initiated sequences
 * that stop looking like a closing script tag before its greater-than sign.
 * Each alternative may be preceded by any number of FalseEndScript pieces. */
ScriptText=({CommentDeclaration}|([^\<]|{FalseEndScript}*[\<][^\/\<]|{FalseEndScript}*[\<][\/]{WhiteSpace}*[^Ss\<]|{FalseEndScript}*[\<][\/]{WhiteSpace}*[Ss][^Cc\<]|{FalseEndScript}*[\<][\/]{WhiteSpace}*[Ss][Cc][^Rr\<]|{FalseEndScript}*[\<][\/]{WhiteSpace}*[Ss][Cc][Rr][^Ii\<]|{FalseEndScript}*[\<][\/]{WhiteSpace}*[Ss][Cc][Rr][Ii][^Pp\<]|{FalseEndScript}*[\<][\/]{WhiteSpace}*[Ss][Cc][Rr][Ii][Pp][^Tt\<]|{FalseEndScript}*[\<][\/]{WhiteSpace}*[Ss][Cc][Rr][Ii][Pp][Tt]{WhiteSpace}*[^\<\>])*)
/* The word "pre" in any mix of upper and lower case. */
NamePre=([Pp][Rr][Ee])
/* A less-than sign, or a less-than sign and slash followed by optional white
 * space and any prefix of "pre" (the full word may be followed by white
 * space): a closing pre tag that has been started but not finished. */
FalseEndPre=([\<]|[\<][\/]{WhiteSpace}*|[\<][\/]{WhiteSpace}*[Pp]|[\<][\/]{WhiteSpace}*[Pp][Rr]|[\<][\/]{WhiteSpace}*[Pp][Rr][Ee]{WhiteSpace}*)
/* A (possibly empty) run built from characters other than less-than and
 * from less-than-initiated sequences that stop looking like a closing pre
 * tag before its greater-than sign. */
PreText=(([^\<]|{FalseEndPre}*[\<][^\/\<]|{FalseEndPre}*[\<][\/]{WhiteSpace}*[^Pp\<]|{FalseEndPre}*[\<][\/]{WhiteSpace}*[Pp][^Rr\<]|{FalseEndPre}*[\<][\/]{WhiteSpace}*[Pp][Rr][^Ee\<]|{FalseEndPre}*[\<][\/]{WhiteSpace}*[Pp][Rr][Ee]{WhiteSpace}*[^\<\>])*)
/* The word "textarea" in any mix of upper and lower case. */
NameTextArea=([Tt][Ee][Xx][Tt][Aa][Rr][Ee][Aa])
/* A less-than sign, or a less-than sign and slash followed by optional white
 * space and any prefix of "textarea" (the full word may be followed by white
 * space): a closing textarea tag that has been started but not finished. */
FalseEndTextArea=([\<]|[\<][\/]{WhiteSpace}*|[\<][\/]{WhiteSpace}*[Tt]|[\<][\/]{WhiteSpace}*[Tt][Ee]|[\<][\/]{WhiteSpace}*[Tt][Ee][Xx]|[\<][\/]{WhiteSpace}*[Tt][Ee][Xx][Tt]|[\<][\/]{WhiteSpace}*[Tt][Ee][Xx][Tt][Aa]|[\<][\/]{WhiteSpace}*[Tt][Ee][Xx][Tt][Aa][Rr]|[\<][\/]{WhiteSpace}*[Tt][Ee][Xx][Tt][Aa][Rr][Ee]|[\<][\/]{WhiteSpace}*[Tt][Ee][Xx][Tt][Aa][Rr][Ee][Aa]{WhiteSpace}*)
/* A (possibly empty) run built from characters other than less-than and
 * from less-than-initiated sequences that stop looking like a closing
 * textarea tag before its greater-than sign. */
TextAreaText=(([^\<]|{FalseEndTextArea}*[\<][^\/\<]|{FalseEndTextArea}*[\<][\/]{WhiteSpace}*[^Tt\<]|{FalseEndTextArea}*[\<][\/]{WhiteSpace}*[Tt][^Ee\<]|{FalseEndTextArea}*[\<][\/]{WhiteSpace}*[Tt][Ee][^Xx\<]|{FalseEndTextArea}*[\<][\/]{WhiteSpace}*[Tt][Ee][Xx][^Tt\<]|{FalseEndTextArea}*[\<][\/]{WhiteSpace}*[Tt][Ee][Xx][Tt][^Aa\<]|{FalseEndTextArea}*[\<][\/]{WhiteSpace}*[Tt][Ee][Xx][Tt][Aa][^Rr\<]|{FalseEndTextArea}*[\<][\/]{WhiteSpace}*[Tt][Ee][Xx][Tt][Aa][Rr][^Ee\<]|{FalseEndTextArea}*[\<][\/]{WhiteSpace}*[Tt][Ee][Xx][Tt][Aa][Rr][Ee][^Aa\<]|{FalseEndTextArea}*[\<][\/]{WhiteSpace}*[Tt][Ee][Xx][Tt][Aa][Rr][Ee][Aa]{WhiteSpace}*[^\<\>])*)
/* SGML-style processing instruction delimiters: opens with less-than and a
 * question mark, closes with one or more question marks and greater-than. */
SGMLProcessingStart=("<?")
SGMLProcessingEnd=([\?]+">")
/* ASP-style block delimiters: opens with less-than and a percent sign,
 * closes with one or more percent signs and greater-than. */
ASPProcessingStart=("<%")
ASPProcessingEnd=([\%]+">")
/* Body of an SGML processing instruction: anything that does not contain
 * the closing delimiter. */
SGMLProcessingText=(([^\?]|[\?]+[^\>\?])*)
SGMLProcessing=({SGMLProcessingStart}{SGMLProcessingText}{SGMLProcessingEnd})
/* Body of an ASP block: anything that does not contain the closing delimiter. */
ASPProcessingText=(([^\%]|[\%]+[^\>\%])*)
ASPProcessing=({ASPProcessingStart}{ASPProcessingText}{ASPProcessingEnd})
/* A name: a letter followed by any number of letters, digits, periods or hyphens. */
Name=({Letter}({Letter}|{Digit}|[\.\-])*)
/* A complete end tag: EndTagStart, a name and TagEnd, with optional white
 * space on either side of the name. */
EndTag=({EndTagStart}{WhiteSpace}*{Name}{WhiteSpace}*{TagEnd})
/* A comment: two hyphens, a body in which no two hyphens are adjacent,
 * and two closing hyphens. */
Comment=("--"([^\-]|([\-][^\-]))*"--")
/* Any single character other than greater-than. */
FalseComment=([^\>])
/* A comment declaration: DocTagStart, zero or more comments each optionally
 * surrounded by white space, then TagEnd. */
CommentDeclaration=({DocTagStart}((({WhiteSpace}*){Comment}({WhiteSpace}*))*){TagEnd})
/* A (possibly empty) run of characters other than blank, CR, LF, form feed,
 * tab, less-than and ampersand. */
Word=(([^\ \r\n\f\t\<\&])*)
/* Single-character classes listing what is NOT expected at a given point.
 * Presumably used as error fallbacks in the lexical states they are named
 * after; the rules are not visible here -- TODO confirm. */
StartTagUnmatched=([^a-zA-Z\>])
StartEndTagUnmatched=([^a-zA-Z\>\ \r\n])
FinishEndTagUnmatched=([^\>\ \r\n])
TagUnmatched=([^a-zA-Z0-9\ \r\n\-\.\>])
StartEqualUnmatched=([^a-zA-Z0-9\ \r\n\-\.\>\=])
StartValueUnmatched=([^a-zA-Z0-9\ \r\n\-\.\>\"\'])
/* An opening quote followed by non-quote characters, with no closing quote. */
UnclosedStringLiteral=(([\"][^\"]*)|([\'][^\']*))
/* Optional white space followed by text that can begin neither the word
 * "doctype" (in any case) nor a double-hyphen comment opener. */
DocTagUnmatched=({WhiteSpace}*(([Dd][^Oo\>])|([Dd][Oo][^Cc\>])|([Dd][Oo][Cc][^Tt\>])|([Dd][Oo][Cc][Tt][^Yy\>])|([Dd][Oo][Cc][Tt][Yy][^Pp\>])|([Dd][Oo][Cc][Tt][Yy][Pp][^Ee\>])|([^Dd\-\ \r\n\>])|([\-][^\-\>])))
/* Any run of characters other than greater-than, optionally followed by TagEnd. */
EndTagError=([^\>]*{TagEnd}?)
%%