00001 package com.quadcap.http.client;
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
import java.io.CharArrayWriter;
00042
import java.io.IOException;
00043
import java.io.Reader;
00044
00045
import org.xml.sax.AttributeList;
00046
import org.xml.sax.DocumentHandler;
00047
import org.xml.sax.DTDHandler;
00048
import org.xml.sax.EntityResolver;
00049
import org.xml.sax.ErrorHandler;
00050
import org.xml.sax.InputSource;
00051
import org.xml.sax.Parser;
00052
import org.xml.sax.SAXException;
00053
00054
import org.xml.sax.helpers.AttributeListImpl;
00055
00056
import com.quadcap.util.collections.ArrayQueue;
00057
00058
00059
00060
00061
00062
/**
 * A small, forgiving SAX1 {@link Parser} for HTML-like markup.
 *
 * <p>The parser reads characters from the {@link InputSource}'s character
 * stream and drives a hand-written state machine that reports start tags,
 * end tags and character data to the registered {@link DocumentHandler}.
 * It performs no validation, no entity expansion and no tag balancing.
 *
 * <p>Reporting conventions:
 * <ul>
 *   <li>Tag names are reported exactly as written; attribute names are
 *       lower-cased (locale-independently).</li>
 *   <li>Every attribute is reported with the type {@code "string"}.</li>
 *   <li>Attributes without a value ({@code <input disabled>}) are reported
 *       with an empty string value.</li>
 *   <li>Markup declarations and comments ({@code <!DOCTYPE ...>},
 *       {@code <!-- ... -->}) are passed through as character data. Text
 *       inside a comment is never interpreted as markup.</li>
 *   <li>The attribute list handed to {@code startElement} is reused and
 *       cleared right after the callback returns, so handlers must copy
 *       anything they want to keep.</li>
 * </ul>
 *
 * <p>Instances keep parsing state in fields and are therefore not safe for
 * concurrent use.
 */
public class HtmlParser implements Parser {
    /** Source passed to the most recent {@link #parse(InputSource)} call. */
    InputSource         in;
    /** Character stream currently being parsed. */
    Reader              r;
    /** Receiver of document events; may be null (events are then dropped). */
    DocumentHandler     docHandler      = null;
    /** Stored for API completeness; never consulted by the parser. */
    DTDHandler          dtdHandler      = null;
    /** Stored for API completeness; never consulted by the parser. */
    EntityResolver      entityResolver  = null;
    /** Scratch buffer for tag names, attribute names and attribute values. */
    CharArrayWriter     tag             = new CharArrayWriter();
    /** Buffer for character data between tags. */
    CharArrayWriter     data            = new CharArrayWriter();
    /** Attributes collected for the start tag currently being parsed. */
    AttributeListImpl   attributes      = new AttributeListImpl();
    /** Name of the element whose start tag is currently being parsed. */
    String              tagName         = null;

    final static int TAG = 1;

    // ---- state machine states ------------------------------------------
    /** Plain text between tags. */
    private static final int S_TEXT                = 0;
    /** Just consumed '&lt;'. */
    private static final int S_TAG_OPEN            = 1;
    /** Just consumed "&lt;!". */
    private static final int S_DECL_OPEN           = 2;
    /** Just consumed "&lt;!-". */
    private static final int S_DECL_DASH           = 3;
    /** Reading the name of a start tag. */
    private static final int S_TAG_NAME            = 5;
    /** Inside a start tag: skipping blanks or reading an attribute name. */
    private static final int S_IN_TAG              = 6;
    /** Blank(s) seen after an attribute name; '=' may still follow. */
    private static final int S_AFTER_ATTR_NAME     = 7;
    /** Reading the name of an end tag. */
    private static final int S_END_TAG             = 8;
    /** Consumed '/' inside a start tag; '&gt;' makes it an empty element. */
    private static final int S_EMPTY_SLASH         = 9;
    /** Consumed '='; the attribute value starts next. */
    private static final int S_ATTR_VALUE_START    = 10;
    /** Inside a double-quoted attribute value. */
    private static final int S_ATTR_DQUOTED        = 12;
    /** Inside a single-quoted attribute value. */
    private static final int S_ATTR_SQUOTED        = 121;
    /** Inside an unquoted attribute value. */
    private static final int S_ATTR_UNQUOTED       = 13;
    /** Consumed '/' inside an unquoted attribute value. */
    private static final int S_ATTR_UNQUOTED_SLASH = 14;
    /** Inside a comment body. */
    private static final int S_COMMENT             = 15;
    /** Inside a comment, one '-' seen. */
    private static final int S_COMMENT_DASH        = 16;
    /** Inside a comment, "--" seen; '&gt;' ends the comment. */
    private static final int S_COMMENT_DASH2       = 17;

    /** Creates a parser with no handlers registered. */
    public HtmlParser() {}

    /**
     * Parses the document supplied by {@code in}'s character stream.
     *
     * @param in source to read; only its character stream is used
     * @throws SAXException if the source has no character stream, or if the
     *         document handler throws
     * @throws IOException if reading from the stream fails
     */
    public void parse(InputSource in) throws SAXException, IOException {
        this.in = in;
        this.r = in.getCharacterStream();
        // Drop anything left over from a previous, possibly truncated, parse.
        tag.reset();
        data.reset();
        attributes.clear();
        parse();
    }

    /**
     * Not supported: this parser only reads from character streams, so this
     * method does nothing and reports no events.
     *
     * @param s system identifier (ignored)
     */
    public void parse(String s) {}

    /**
     * Registers the receiver of document events.
     *
     * @param dh the handler, or null to drop all events
     */
    public void setDocumentHandler(DocumentHandler dh) {
        this.docHandler = dh;
    }

    /**
     * Stores the DTD handler. The parser never reports DTD events.
     *
     * @param dh the handler to store
     */
    public void setDTDHandler(DTDHandler dh) {
        this.dtdHandler = dh;
    }

    /**
     * Stores the entity resolver. The parser never resolves entities.
     *
     * @param er the resolver to store
     */
    public void setEntityResolver(EntityResolver er) {
        this.entityResolver = er;
    }

    /**
     * Returns the resolver last passed to {@link #setEntityResolver}.
     *
     * @return the stored resolver, or null if none was set
     */
    public EntityResolver getEntityResolver() {
        return entityResolver;
    }

    /**
     * Accepted for interface compatibility; the parser reports no errors,
     * so the handler is ignored.
     *
     * @param eh error handler (ignored)
     */
    public void setErrorHandler(ErrorHandler eh) {
    }

    /**
     * Accepted for interface compatibility; the locale is ignored.
     *
     * @param locale requested locale (ignored)
     */
    public void setLocale(java.util.Locale locale) {
    }

    /**
     * Runs the state machine over the current reader {@link #r} until end of
     * input, bracketing all events with {@code startDocument} and
     * {@code endDocument}.
     *
     * @throws SAXException if no reader is available, or if the document
     *         handler throws
     * @throws IOException if reading from the stream fails
     */
    void parse() throws SAXException, IOException {
        if (r == null) {
            throw new SAXException(
                "HtmlParser requires an InputSource with a character stream");
        }
        // SAX: with no registered handler, events are silently ignored.
        DocumentHandler handler = docHandler;
        if (handler == null) {
            handler = new org.xml.sax.HandlerBase();
        }

        int state = S_TEXT;
        String attrName = null;

        handler.startDocument();
        for (int c = r.read(); c >= 0; c = r.read()) {
            switch (state) {
            case S_TEXT:
                state = text(c, handler);
                break;

            case S_TAG_OPEN:
                if (c == '!') {
                    // Declarations and comments are passed through as text.
                    data.write('<');
                    data.write('!');
                    state = S_DECL_OPEN;
                } else if (c == '/') {
                    state = S_END_TAG;
                } else {
                    tag.write(c);
                    state = S_TAG_NAME;
                }
                break;

            case S_DECL_OPEN:
                if (c == '-') {
                    data.write(c);
                    state = S_DECL_DASH;
                } else {
                    // Not a comment: treat the rest as ordinary text.
                    state = text(c, handler);
                }
                break;

            case S_DECL_DASH:
                if (c == '-') {
                    data.write(c);
                    // The opening "--" also counts towards a closing "-->",
                    // so "<!-->" is a complete (empty) comment.
                    state = S_COMMENT_DASH2;
                } else {
                    state = text(c, handler);
                }
                break;

            case S_COMMENT:
                data.write(c);
                if (c == '-') {
                    state = S_COMMENT_DASH;
                }
                break;

            case S_COMMENT_DASH:
                data.write(c);
                state = (c == '-') ? S_COMMENT_DASH2 : S_COMMENT;
                break;

            case S_COMMENT_DASH2:
                data.write(c);
                if (c == '>') {
                    state = S_TEXT;
                } else if (c != '-') {
                    state = S_COMMENT;
                }
                break;

            case S_TAG_NAME:
                if (isSpace(c)) {
                    tagName = takeTag();
                    state = S_IN_TAG;
                } else if (c == '/') {
                    tagName = takeTag();
                    state = S_EMPTY_SLASH;
                } else if (c == '>') {
                    tagName = takeTag();
                    openElement(handler);
                    state = S_TEXT;
                } else {
                    tag.write(c);
                }
                break;

            case S_IN_TAG:
                if (isSpace(c)) {
                    // A blank after a partial name ends that name, but an
                    // '=' may still follow ("name = value").
                    if (tag.size() > 0) {
                        state = S_AFTER_ATTR_NAME;
                    }
                } else if (c == '/') {
                    state = S_EMPTY_SLASH;
                } else if (c == '>') {
                    addBareAttribute();
                    openElement(handler);
                    state = S_TEXT;
                } else if (c == '=') {
                    attrName = takeTag();
                    state = S_ATTR_VALUE_START;
                } else {
                    tag.write(c);
                }
                break;

            case S_AFTER_ATTR_NAME:
                if (isSpace(c)) {
                    // keep skipping
                } else if (c == '=') {
                    attrName = takeTag();
                    state = S_ATTR_VALUE_START;
                } else {
                    // The pending name had no value; report it, then treat
                    // this character as the in-tag state would.
                    addBareAttribute();
                    if (c == '/') {
                        state = S_EMPTY_SLASH;
                    } else if (c == '>') {
                        openElement(handler);
                        state = S_TEXT;
                    } else {
                        tag.write(c);
                        state = S_IN_TAG;
                    }
                }
                break;

            case S_END_TAG:
                if (c == '>') {
                    // Tolerate blanks before the closing '>' ("</a >").
                    tagName = takeTag().trim();
                    handler.endElement(tagName);
                    state = S_TEXT;
                } else {
                    tag.write(c);
                }
                break;

            case S_EMPTY_SLASH:
                if (c == '>') {
                    addBareAttribute();
                    openElement(handler);
                    handler.endElement(tagName);
                    state = S_TEXT;
                } else {
                    // The '/' was not the end of the tag; keep it as part
                    // of whatever name is being collected.
                    tag.write('/');
                    tag.write(c);
                    state = S_IN_TAG;
                }
                break;

            case S_ATTR_VALUE_START:
                if (c == '"') {
                    state = S_ATTR_DQUOTED;
                } else if (c == '\'') {
                    state = S_ATTR_SQUOTED;
                } else if (isSpace(c)) {
                    // blanks between '=' and the value are not part of it
                } else if (c == '>') {
                    // "name=>" : empty value, tag ends here.
                    addAttribute(attrName);
                    openElement(handler);
                    state = S_TEXT;
                } else {
                    tag.write(c);
                    state = S_ATTR_UNQUOTED;
                }
                break;

            case S_ATTR_DQUOTED:
            case S_ATTR_SQUOTED:
                if (c == (state == S_ATTR_DQUOTED ? '"' : '\'')) {
                    addAttribute(attrName);
                    state = S_IN_TAG;
                } else {
                    tag.write(c);
                }
                break;

            case S_ATTR_UNQUOTED:
                if (isSpace(c)) {
                    addAttribute(attrName);
                    state = S_IN_TAG;
                } else if (c == '/') {
                    state = S_ATTR_UNQUOTED_SLASH;
                } else if (c == '>') {
                    addAttribute(attrName);
                    openElement(handler);
                    state = S_TEXT;
                } else {
                    tag.write(c);
                }
                break;

            case S_ATTR_UNQUOTED_SLASH:
                if (c == '>') {
                    // A '/' directly before '>' is dropped from the value.
                    // No endElement is reported here: the slash may just be
                    // the tail of an unquoted URL (href=http://host/).
                    addAttribute(attrName);
                    openElement(handler);
                    state = S_TEXT;
                } else {
                    tag.write('/');
                    if (isSpace(c)) {
                        addAttribute(attrName);
                        state = S_IN_TAG;
                    } else if (c != '/') {
                        tag.write(c);
                        state = S_ATTR_UNQUOTED;
                    }
                    // on a second '/', stay here so "//" is kept intact
                }
                break;

            default:
                throw new IllegalStateException("unknown parser state " + state);
            }
        }

        // End of input: deliver trailing text and close the document.
        flushText(handler);
        handler.endDocument();
    }

    /** Handles one character of plain text and returns the next state. */
    private int text(int c, DocumentHandler handler) throws SAXException {
        if (c == '<') {
            flushText(handler);
            return S_TAG_OPEN;
        }
        data.write(c);
        return S_TEXT;
    }

    /** Reports buffered character data, if any, and empties the buffer. */
    private void flushText(DocumentHandler handler) throws SAXException {
        if (data.size() > 0) {
            handler.characters(data.toCharArray(), 0, data.size());
            data.reset();
        }
    }

    /** Reports the start of {@link #tagName} and clears the attribute list. */
    private void openElement(DocumentHandler handler) throws SAXException {
        handler.startElement(tagName, attributes);
        attributes.clear();
    }

    /** Returns the contents of the scratch buffer and empties it. */
    private String takeTag() {
        String s = tag.toString();
        tag.reset();
        return s;
    }

    /** Adds an attribute whose value is the current scratch buffer. */
    private void addAttribute(String name) {
        attributes.addAttribute(name.toLowerCase(java.util.Locale.ENGLISH),
                                "string", takeTag());
    }

    /** Adds a pending valueless attribute name, if any, with an empty value. */
    private void addBareAttribute() {
        if (tag.size() > 0) {
            String name = takeTag();
            attributes.addAttribute(name.toLowerCase(java.util.Locale.ENGLISH),
                                    "string", "");
        }
    }

    /** Returns true for the blanks that separate tokens inside a tag. */
    private static boolean isSpace(int c) {
        return c == ' ' || c == '\n' || c == '\r' || c == '\t';
    }
}