00001
package com.quadcap.text.sax;
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
import java.io.CharArrayWriter;
00042
import java.io.IOException;
00043
import java.io.InputStreamReader;
00044
import java.io.Reader;
00045
00046
import org.xml.sax.DocumentHandler;
00047
import org.xml.sax.DTDHandler;
00048
import org.xml.sax.EntityResolver;
00049
import org.xml.sax.ErrorHandler;
00050
import org.xml.sax.HandlerBase;
00051
import org.xml.sax.InputSource;
00052
import org.xml.sax.SAXException;
00053
00054
import com.quadcap.text.NoStringPool;
00055
import com.quadcap.text.StringPool;
00056
import com.quadcap.util.collections.ArrayQueue;
00057
00058
import com.quadcap.util.Debug;
00059
00060
00061
00062
00063
00064
00065 public class Parser implements org.xml.sax.
Parser {
00066 boolean docStarted =
false;
00067 HandlerBase
defaultHandler =
new HandlerBase();
00068 StringPool pool =
new NoStringPool();
00069 InputSource
in;
00070 Reader
r;
00071 DocumentHandler
docHandler =
defaultHandler;
00072 DTDHandler
dtdHandler =
defaultHandler;
00073 EntityResolver
entityResolver =
defaultHandler;
00074 ErrorHandler
errorHandler =
defaultHandler;
00075 char[]
ebuf =
new char[6];
00076 char[]
tag =
new char[1024*32];
00077 int taglen = 0;
00078 CharArrayWriter
data =
new CharArrayWriter();
00079 AttributeList attributes =
new AttributeList();
00080 String
attrName = null;
00081 String
tagName = null;
00082 ArrayQueue
inStack = null;
00083 ArrayQueue
locStack = null;
00084 int lineNumber = 1;
00085 int columnNumber = 1;
00086 String
lastEntityVal =
"";
00087 boolean trace =
false;
00088 int commentLevel = 0;
00089
00090 public Parser() {}
00091
// Body of the parse entry point that takes an InputSource named `in`.
// NOTE(review): the method's signature line (listing line 00092) is not present
// in this listing, so the declared modifiers and throws clause cannot be confirmed here.
//
// Resets per-document state (tag buffer length, line/column counters, data buffer),
// obtains a Reader for the source and runs the no-argument parse().
00093
this.in =
in;
00094
this.r =
getCharacterStream(
in);
00095
taglen = 0;
00096
lineNumber = 1;
00097
columnNumber = 1;
00098
data.reset();
00099
try {
00100
parse();
// NOTE(review): listing line 00101 is missing; it presumably holds the catch clause
// that declares `ex` -- confirm the caught type against the real source.
// The handler prints one " at line:column" entry per saved location in locStack
// (if any input sources were pushed) and then rethrows the same exception.
00102
if (
locStack != null) {
00103
for (
int i = 0; i <
locStack.size(); i++) {
00104 com.quadcap.util.Debug.println(
" at " +
locStack.top(i));
00105 }
00106 }
00107
throw ex;
00108 }
00109 }
00110
00111 final Reader
getCharacterStream(InputSource in) {
00112 Reader rd = in.getCharacterStream();
00113
if (rd == null) {
00114 rd =
new InputStreamReader(in.getByteStream());
00115 }
00116
return rd;
00117 }
00118
00119 public void pushInputSource(InputSource in2) {
00120
if (
inStack == null) {
00121
inStack =
new ArrayQueue();
00122
locStack =
new ArrayQueue();
00123 }
00124
inStack.push(
in);
00125
locStack.push(
"" +
lineNumber +
":" +
columnNumber);
00126
lineNumber = 1;
00127 columnNumber = 1;
00128
in = in2;
00129
r = getCharacterStream(
in);
00130 }
00131
00132 boolean popInputSource() {
00133
if (
inStack == null ||
inStack.size() == 0)
return false;
00134
in = (InputSource)
inStack.pop();
00135 String s =
locStack.pop().toString();
00136
int idx = s.indexOf(
':');
00137
lineNumber = Integer.parseInt(s.substring(0, idx));
00138
columnNumber = Integer.parseInt(s.substring(idx+1));
00139
r = getCharacterStream(
in);
00140
return true;
00141 }
00142
// Body of the helper that appends one character to the tag buffer
// (callers elsewhere in this file invoke it as addTagChar(c)).
// NOTE(review): listing lines 00143-00144, which would hold the signature and
// possibly more, are missing from this listing. No bounds check against
// tag.length is visible in the part that is shown.
00145
// Store c (narrowed to char) at the current end of the buffer and advance taglen.
tag[
taglen++] = (
char)c;
00146 }
00147
00148 public void parse(String s) {
00149 }
00150
00151 public void setDocumentHandler(DocumentHandler dh) {
00152
this.docHandler = dh;
00153 }
00154
00155 public void setDTDHandler(DTDHandler dh) {
00156
this.dtdHandler = dh;
00157 }
00158
00159 public void setEntityResolver(EntityResolver er) {
00160
this.entityResolver = er;
00161 }
00162
00163 public EntityResolver
getEntityResolver() {
00164
return entityResolver;
00165 }
00166
00167 public void setErrorHandler(ErrorHandler er) {
00168
errorHandler = er;
00169 }
00170
00171 public void setLocale(java.util.Locale locale) {
00172 }
00173
00174 final int read() throws IOException {
00175
int c =
r.read();
00176
if (c ==
'\n') {
00177
lineNumber++;
00178
columnNumber = 1;
00179 }
else {
00180
columnNumber++;
00181 }
00182
return c;
00183 }
00184
// Body of the entity-reference reader (invoked from the state machine as parseEntity()
// after an '&' has been consumed).
// NOTE(review): the signature line (listing line 00185) is missing from this listing,
// so the declared return type and throws clause cannot be confirmed here.
//
// Reads characters into ebuf until a non-letter has been stored, ebuf is full, or input
// ends. The terminating non-letter is stored as well, which is why the length tests
// below are one more than the entity name length.
00186
int len = 0;
00187
int c;
00188
// NOTE(review): `state` is never used in the visible body.
int state = 0;
00189
while ((c =
read()) >= 0) {
00190
ebuf[len++] = (
char)c;
00191
if (!Character.isLetter((
char)c) || len >=
ebuf.length)
break;
00192 }
00193
// Remember the raw text that was consumed; the caller and the error message use it.
lastEntityVal =
new String(
ebuf, 0, len);
// "quot" plus one terminating character.
// NOTE(review): none of the checks below verify that the terminator is ';'.
00194
if (len == 5 &&
ebuf[0] ==
'q' && ebuf[1] ==
'u' &&
00195 ebuf[2] ==
'o' && ebuf[3] ==
't') {
00196
return '"';
00197 }
// "amp" plus one terminating character.
00198
if (len == 4 && ebuf[0] ==
'a' && ebuf[1] ==
'm' && ebuf[2] ==
'p') {
00199
return '&';
00200 }
// "lt" / "gt" plus one terminating character.
00201
if (len == 3) {
00202
if (ebuf[0] ==
'l') {
00203
if (ebuf[1] ==
't')
return '<';
00204 }
else if (ebuf[0] ==
'g') {
00205
if (ebuf[1] ==
't')
return '>';
00206 }
00207 }
// Anything else (no other entity names are recognised here) is reported as unknown.
00208
throw new SAXException(
"unknown entity: " +
lastEntityVal);
00209
00210 }
00211
// Body of the single-character state-machine step (invoked as step(state, c)):
// consumes character `c` in state `state`, fires handler callbacks as constructs
// complete, and returns the next state.
// NOTE(review): the signature line (listing line 00212) is missing from this listing,
// as are listing lines 00227 and 00591 inside the body.
//
// State summary, as read from the transitions below:
//    0            character data
//    1            just after '<'
//    4            after "<\" (written back to data literally)
//    5            element name in a start tag
//    6            inside a start tag, collecting an attribute name
//    61, 62, 63   "<? ... ?>" embedded inside a start tag (kept in the tag buffer)
//    8            element name in an end tag
//    9            '/' seen inside a start tag
//    10           just after '=' of an attribute
//    12, 121      double- / single-quoted attribute value
//    13, 14       unquoted attribute value / '/' seen inside one
//    15, 16, 17   scan for "-->"
//    20, 21       processing instruction body / '?' seen inside it
//    30, 31       after "<!" / after "<!-"
//    32..34, 320..322  comment body with nested "<!--" counted in commentLevel
//    40           skip to next '>'
//    41           after "<![" collecting the section keyword
//    42, 43, 44   CDATA content / one ']' seen / "]]" seen
00213
00214
00215
switch (state) {
// State 0: character data.
00216
case 0:
00217
if (c ==
'<') {
// Flush any pending text to the handler before starting markup.
00218
if (
data.size() > 0) {
00219
docHandler.characters(
data.toCharArray(), 0,
data.size());
00220
data.reset();
00221 }
00222 state = 1;
00223 }
else {
00224
if (c ==
'&') {
00225
try {
00226 c =
parseEntity();
// NOTE(review): listing line 00227 is missing here; it presumably closes the try
// and opens a catch, making the two writes below the fallback that keeps the raw
// "&..." text -- confirm against the real source.
00228
data.write(
'&');
00229
data.write(
lastEntityVal);
00230
break;
00231 }
00232 }
00233
data.write(c);
00234 }
00235
break;
// State 1: the character right after '<' selects the kind of markup.
00236
case 1:
00237
switch (c) {
00238
case '!':
00239 state = 30;
00240
break;
00241
case '\\':
00242 state = 4;
00243
break;
00244
case '/':
00245 state = 8;
00246
break;
00247
case '?':
00248
data.reset();
00249 state = 20;
00250
break;
00251
default:
00252 addTagChar(c);
00253 state = 5;
00254
break;
00255 }
00256
break;
// State 4: "<\" was seen; emit '<' and the current character as plain text.
00257
case 4:
00258
data.write(
'<');
00259
data.write(c);
00260 state = 0;
00261
break;
// State 5: accumulating the element name of a start tag.
00262
case 5:
00263
switch (c) {
// Whitespace ends the name; attributes may follow.
00264
case ' ':
case '\r':
case '\n':
case '\t':
00265
tagName =
pool.
intern(
tag, 0,
taglen);
00266
taglen = 0;
00267 state = 6;
00268
break;
00269
case '/':
00270
tagName =
pool.
intern(
tag, 0,
taglen);
00271
taglen = 0;
00272 state = 9;
00273
break;
00274
case '>':
00275
tagName =
pool.
intern(
tag, 0,
taglen);
00276
taglen = 0;
00277 state = 0;
00278
startElement(
tagName,
attributes);
00279
break;
// A second '<' abandons the name collected so far and restarts markup detection.
00280
case '<':
00281
tagName =
pool.
intern(
tag, 0,
taglen);
00282
taglen = 0;
00283
if (
data.size() > 0) {
00284
docHandler.characters(
data.toCharArray(),
00285 0,
data.size());
00286
data.reset();
00287 }
00288 state = 1;
00289
break;
00290
default:
// Letters, digits and . - _ : extend the name.
00291
if (Character.isLetter((
char)c) ||
00292 Character.isDigit((
char)c) ||
00293 c ==
'.' || c ==
'-' || c ==
'_' || c ==
':') {
00294 addTagChar(c);
00295 }
else {
// Not a name character: treat what was collected as text instead.
// Only the buffered name characters and c are written; the opening '<' is not.
00296
00297
00298
for (
int i = 0; i <
taglen; i++) {
00299
data.write(
tag[i]);
00300 }
00301
data.write(c);
00302 state = 0;
00303 taglen = 0;
00304
break;
00305 }
00306
00307 }
00308
break;
// State 6: inside a start tag after the element name; collects an attribute name.
00309
case 6:
00310
switch (c) {
00311
case ' ':
case '\n':
case '\r':
case '\t':
00312
break;
00313
case '/':
00314 state = 9;
00315
break;
00316
case '%':
00317 addTagChar(c);
00318
break;
00319
case '>':
00320 state = 0;
00321
startElement(
tagName,
attributes);
00322
break;
// '=' ends the attribute name; the value follows.
00323
case '=':
00324
attrName =
pool.
intern(
tag, 0,
taglen);
00325
taglen = 0;
00326 state = 10;
00327
break;
00328
case '<':
00329 state = 61;
00330
break;
00331
default:
00332 addTagChar(c);
00333 }
00334
break;
// State 61: '<' seen inside a start tag; "<?" opens an embedded section.
00335
case 61:
00336
switch (c) {
00337
case '?':
00338 state = 62;
00339
break;
00340
default:
00341 addTagChar(
'<');
00342 addTagChar(c);
00343 state = 6;
00344
break;
00345 }
00346
break;
// State 62: inside the embedded "<? ..." section; '?' may begin its end.
00347
case 62:
00348
switch (c) {
00349
case '?':
00350 state = 63;
00351
break;
00352
default:
00353 addTagChar(c);
00354
break;
00355 }
00356
break;
// State 63: '?' seen inside the embedded section; '>' returns to state 6.
// In the default branch only '?' is appended to the tag buffer, not c itself.
00357
case 63:
00358
switch(c) {
00359
case '>':
00360 addTagChar(c);
00361 state = 6;
00362
break;
00363
default:
00364 addTagChar(
'?');
00365
if (c !=
'?') state = 62;
00366
break;
00367 }
00368
break;
// State 8: element name of an end tag; '>' reports endElement.
00369
case 8:
00370
if (c ==
'>') {
00371
tagName =
pool.
intern(
tag, 0,
taglen);
00372
taglen = 0;
00373 state = 0;
00374
docHandler.endElement(
tagName);
00375 }
else {
00376 addTagChar(c);
00377 }
00378
break;
// State 9: '/' seen in a start tag; "/>" reports start and end of an empty element,
// anything else puts the '/' and c into the tag buffer and resumes state 6.
00379
case 9:
00380
if (c ==
'>') {
00381
startElement(
tagName,
attributes);
00382 state = 0;
00383
docHandler.endElement(
tagName);
00384 }
else {
00385 addTagChar(
'/');
00386 addTagChar(c);
00387 state = 6;
00388 }
00389
break;
// State 10: first character after '=' decides the quoting style of the value.
00390
case 10:
00391
if (c ==
'"') {
00392 state = 12;
00393 }
else if (c ==
'\'') {
00394 state = 121;
00395 }
else {
00396 addTagChar(c);
00397 state = 13;
00398 }
00399
break;
// State 12: double-quoted attribute value; the closing quote records the attribute.
00400
case 12:
00401
if (c ==
'"') {
00402
attributes.
addAttribute(
attrName,
"CDATA",
00403
pool.
intern(
tag, 0,
taglen));
00404
taglen = 0;
00405 state = 6;
00406 }
else {
00407 addTagChar(c);
00408 }
00409
break;
// State 121: single-quoted attribute value; the closing quote records the attribute.
00410
case 121:
00411
if (c ==
'\'') {
00412
attributes.
addAttribute(
attrName,
"CDATA",
00413
pool.
intern(
tag, 0,
taglen));
00414
taglen = 0;
00415 state = 6;
00416 }
else {
00417 addTagChar(c);
00418 }
00419
break;
// State 13: unquoted attribute value; ended by a space or by '>'.
00420
case 13:
00421
switch (c) {
00422
case ' ':
00423
attributes.
addAttribute(
attrName,
"CDATA",
00424
pool.
intern(
tag, 0,
taglen));
00425
taglen = 0;
00426 state = 6;
00427
break;
00428
case '/':
00429 state = 14;
00430
break;
00431
case '>':
00432
attributes.
addAttribute(
attrName,
"CDATA",
00433
pool.
intern(
tag, 0,
taglen));
00434
taglen = 0;
00435 state = 0;
00436
startElement(
tagName,
attributes);
00437
break;
00438
default:
00439 addTagChar(c);
00440 }
00441
break;
// State 14: '/' seen inside an unquoted value; "/>" closes an empty element,
// otherwise the '/' belongs to the value.
00442
case 14:
00443
if (c ==
'>') {
00444
attributes.
addAttribute(
attrName,
"CDATA",
00445
pool.
intern(
tag, 0,
taglen));
00446
taglen = 0;
00447 state = 0;
00448
startElement(
tagName,
attributes);
00449
docHandler.endElement(
tagName);
00450 }
else {
00451 addTagChar(
'/');
00452
if (c !=
'/') {
00453 addTagChar(c);
00454 state = 13;
00455 }
00456 }
00457
break;
// States 15-17: skip input until "-->" has been seen.
// NOTE(review): within this body, state 15 is only entered from states 16 and 17;
// no other visible transition leads into this group.
00458
case 15:
00459
if (c ==
'-') state = 16;
00460
break;
00461
case 16:
00462
if (c ==
'-') state = 17;
00463
else state = 15;
00464
break;
00465
case 17:
00466
if (c ==
'>') state = 0;
00467
else if (c !=
'-') state = 15;
00468
break;
// State 20: processing-instruction body, accumulated in data.
00469
case 20:
00470
if (c ==
'?') state = 21;
00471
else data.write(c);
00472
break;
// State 21: '?' seen inside a processing instruction; "?>" completes it.
00473
case 21:
00474
if (c ==
'>') {
00475 String s =
data.toString().trim();
// Content starting with "xml" is not reported as a processing instruction; it triggers
// startDocument (once) when no input source is stacked.
00476
if (s.startsWith(
"xml")) {
00477
if (
inStack == null ||
inStack.size() == 0) {
00478
if (!
docStarted) {
00479
docStarted =
true;
00480
docHandler.startDocument();
00481 }
00482 }
00483 }
else {
// Split at the first space: target before it, trimmed remainder as the data.
00484
int idx = s.indexOf(
' ');
00485 String dat =
"";
00486 String target = s;
00487
if (idx >= 0) {
00488 target = s.substring(0, idx);
00489 dat = s.substring(idx+1).trim();
00490 }
00491
docHandler.processingInstruction(target, dat);
00492 }
00493
data.reset();
00494 state = 0;
00495 }
else {
// The '?' was content after all; keep it (and c, unless c is another '?').
00496
data.write(
'?');
00497
if (c !=
'?') {
00498
data.write(c);
00499 state = 20;
00500 }
00501 }
00502
break;
// State 30: after "<!" -- '-' may start a comment, '[' a marked section,
// anything else is skipped up to the next '>'.
00503
case 30:
00504
if (c ==
'-') state = 31;
00505
else if (c ==
'[') state = 41;
00506
else state = 40;
00507
break;
// State 31: after "<!-"; a second '-' opens a comment at nesting level 1.
00508
case 31:
00509
if (c ==
'-') {
00510
commentLevel = 1;
00511 state = 32;
00512 }
00513
else state = 40;
00514
break;
// State 32: comment body; watch for '-' (possible end) and '<' (possible nested start).
00515
case 32:
00516
if (c ==
'-') state = 33;
00517
else if (c ==
'<') state = 320;
00518
break;
// States 320-322: recognise a nested "<!--" and count it in commentLevel.
00519
case 320:
00520
if (c ==
'!') state = 321;
00521
else if (c ==
'-') state = 33;
00522
else state = 32;
00523
break;
00524
case 321:
00525
if (c ==
'-') state = 322;
00526
else state = 32;
00527
break;
00528
case 322:
00529
if (c ==
'-') {
00530
commentLevel++;
00531 }
00532 state = 32;
00533
break;
// State 33: one '-' seen inside a comment.
00534
case 33:
00535
if (c ==
'-') state = 34;
00536
else state = 32;
00537
break;
// State 34: "--" seen; '>' closes one nesting level, and the comment ends at level 0.
00538
case 34:
00539
if (c ==
'>') {
00540
if (--
commentLevel == 0) {
00541 state = 0;
00542 }
else {
00543 state = 32;
00544 }
00545 }
00546
else if (c !=
'-') state = 32;
00547
break;
// State 40: ignore everything up to and including the next '>'.
00548
case 40:
00549
if (c ==
'>') state = 0;
00550
break;
// State 41: after "<![" -- collect the keyword in data until the next '['.
// Only "CDATA" is accepted; in the other branch data is not reset before skipping.
00551
case 41:
00552
if (c ==
'[') {
00553
if (
data.toString().equals(
"CDATA")) {
00554
data.reset();
00555 state = 42;
00556 }
else {
00557 state = 40;
00558 }
00559 }
else {
00560
data.write(c);
00561 }
00562
break;
// State 42: CDATA content, copied to data verbatim.
00563
case 42:
00564
if (c ==
']') {
00565 state = 43;
00566 }
else {
00567
data.write(c);
00568 }
00569
break;
// State 43: one ']' seen inside CDATA.
00570
case 43:
00571
if (c ==
']') {
00572 state = 44;
00573 }
else {
00574
data.write(
']');
00575
data.write(c);
00576 state = 42;
00577 }
00578
break;
// State 44: "]]" seen; '>' ends the section and returns to state 0 without resetting
// data, a further ']' emits one ']' and stays here, anything else emits "]]" plus c.
00579
case 44:
00580
if (c ==
'>') {
00581 state = 0;
00582 }
else if (c ==
']') {
00583
data.write(
']');
00584 }
else {
00585
data.write(
"]]");
00586
data.write(c);
00587 state = 42;
00588 }
00589
break;
// NOTE(review): listing line 00591 (the body of the default branch) is missing here.
00590
default:
00592 }
00593
return state;
00594 }
00595
// Body of the no-argument parse driver (invoked as parse() from the InputSource variant).
// NOTE(review): the signature line (listing line 00596) is missing from this listing.
//
// Hands the document handler a Locator backed by this parser, consumes input sources
// until parseUntilEOF() reports that none is left to resume, then signals endDocument.
00597
// NOTE(review): `state` is never used in the visible body.
int state = 0;
00598
docHandler.setDocumentLocator(
new Locator(
this));
00599
// Each iteration consumes one input source to its end; true means a suspended
// source was restored and parsing should continue.
while (
parseUntilEOF()) {}
00600
docHandler.endDocument();
00601 }
00602
// Body of parseUntilEOF(): feeds characters from the current reader through step()
// until end of input, starting from state 0.
// NOTE(review): the signature line (listing line 00603) is missing from this listing.
//
// Returns the result of popInputSource(), i.e. true when a suspended input source
// was restored after the current one ran out.
00604
boolean ret =
false;
00605
int state = 0;
00606
while (state >= 0) {
00607
int c =
read();
00608
if (c < 0) {
// End of this input: close the reader on a best-effort basis (any exception from
// close() is deliberately ignored), then try to resume the previous source.
00609
try {
r.close(); }
catch (Exception e) {}
00610 ret =
popInputSource();
// A negative state terminates the loop.
00611 state = -1;
00612 }
else {
00613 state = step(state, c);
00614 }
00615 }
00616
return ret;
00617 }
00618
00619 public int getLineNumber() {
00620
return lineNumber;
00621 }
00622
00623 public int getColumnNumber() {
00624
return columnNumber;
00625 }
00626
// Body of the internal startElement helper (invoked as startElement(tagName, attributes)).
// NOTE(review): the signature line (listing line 00627) is missing from this listing,
// so the parameter names cannot be confirmed; the visible body uses the names
// tagName and attributes.
00628
// Make sure startDocument has been delivered before the first element event.
if (!
docStarted) {
00629
docStarted =
true;
00630
docHandler.startDocument();
00631 }
00632
docHandler.startElement(
tagName,
attributes);
00633
// The attribute list is reused for the next tag, so empty it after the callback returns.
attributes.
clear();
00634 }
00635
00636 }