Quadcap Embeddable Database

com/quadcap/text/sax/Parser.java

Go to the documentation of this file.
package com.quadcap.text.sax;

/* Copyright 1999 - 2003 Quadcap Software. All rights reserved.
 *
 * This software is distributed under the Quadcap Free Software License.
 * This software may be used or modified for any purpose, personal or
 * commercial. Open Source redistributions are permitted. Commercial
 * redistribution of larger works derived from, or works which bundle
 * this software requires a "Commercial Redistribution License"; see
 * http://www.quadcap.com/purchase.
 *
 * Redistributions qualify as "Open Source" under one of the following terms:
 *
 * Redistributions are made at no charge beyond the reasonable cost of
 * materials and delivery.
 *
 * Redistributions are accompanied by a copy of the Source Code or by an
 * irrevocable offer to provide a copy of the Source Code for up to three
 * years at the cost of materials and delivery. Such redistributions
 * must allow further use, modification, and redistribution of the Source
 * Code under substantially the same terms as this license.
 *
 * Redistributions of source code must retain the copyright notices as they
 * appear in each source code file, these license terms, and the
 * disclaimer/limitation of liability set forth as paragraph 6 below.
 *
 * Redistributions in binary form must reproduce this Copyright Notice,
 * these license terms, and the disclaimer/limitation of liability set
 * forth as paragraph 6 below, in the documentation and/or other materials
 * provided with the distribution.
 *
 * The Software is provided on an "AS IS" basis. No warranty is
 * provided that the Software is free of defects, or fit for a
 * particular purpose.
 *
 * Limitation of Liability.
Quadcap Software shall not be liable 00037 * for any damages suffered by the Licensee or any third party resulting 00038 * from use of the Software. 00039 */ 00040 00041 import java.io.CharArrayWriter; 00042 import java.io.IOException; 00043 import java.io.InputStreamReader; 00044 import java.io.Reader; 00045 00046 import org.xml.sax.DocumentHandler; 00047 import org.xml.sax.DTDHandler; 00048 import org.xml.sax.EntityResolver; 00049 import org.xml.sax.ErrorHandler; 00050 import org.xml.sax.HandlerBase; 00051 import org.xml.sax.InputSource; 00052 import org.xml.sax.SAXException; 00053 00054 import com.quadcap.text.NoStringPool; 00055 import com.quadcap.text.StringPool; 00056 import com.quadcap.util.collections.ArrayQueue; 00057 00058 import com.quadcap.util.Debug; 00059 00060 /** 00061 * SAX Parser implementation. 00062 * 00063 * @author Stan Bailes 00064 */ 00065 public class Parser implements org.xml.sax.Parser { 00066 boolean docStarted = false; 00068 StringPool pool = new NoStringPool(); 00070 Reader r; 00075 char[] ebuf = new char[6]; 00076 char[] tag = new char[1024*32]; 00077 int taglen = 0; 00078 CharArrayWriter data = new CharArrayWriter(); 00079 AttributeList attributes = new AttributeList(); 00080 String attrName = null; 00081 String tagName = null; 00082 ArrayQueue inStack = null; 00083 ArrayQueue locStack = null; 00084 int lineNumber = 1; 00085 int columnNumber = 1; 00086 String lastEntityVal = ""; 00087 boolean trace = false; 00088 int commentLevel = 0; 00089 00090 public Parser() {} 00091 00093 this.in = in; 00094 this.r = getCharacterStream(in); 00095 taglen = 0; 00096 lineNumber = 1; 00097 columnNumber = 1; 00098 data.reset(); 00099 try { 00100 parse(); 00102 if (locStack != null) { 00103 for (int i = 0; i < locStack.size(); i++) { 00104 com.quadcap.util.Debug.println(" at " + locStack.top(i)); 00105 } 00106 } 00107 throw ex; 00108 } 00109 } 00110 00113 if (rd == null) { 00115 } 00116 return rd; 00117 } 00118 00120 if (inStack == null) { 00121 
inStack = new ArrayQueue(); 00122 locStack = new ArrayQueue(); 00123 } 00124 inStack.push(in); 00125 locStack.push("" + lineNumber + ":" + columnNumber); 00126 lineNumber = 1; 00127 columnNumber = 1; 00128 in = in2; 00129 r = getCharacterStream(in); 00130 } 00131 00132 boolean popInputSource() { 00133 if (inStack == null || inStack.size() == 0) return false; 00135 String s = locStack.pop().toString(); 00136 int idx = s.indexOf(':'); 00137 lineNumber = Integer.parseInt(s.substring(0, idx)); 00138 columnNumber = Integer.parseInt(s.substring(idx+1)); 00139 r = getCharacterStream(in); 00140 return true; 00141 } 00142 00145 tag[taglen++] = (char)c; 00146 } 00147 00148 public void parse(String s) { 00149 } 00150 00152 this.docHandler = dh; 00153 } 00154 00156 this.dtdHandler = dh; 00157 } 00158 00160 this.entityResolver = er; 00161 } 00162 00164 return entityResolver; 00165 } 00166 00168 errorHandler = er; 00169 } 00170 00171 public void setLocale(java.util.Locale locale) { 00172 } 00173 00174 final int read() throws IOException { 00175 int c = r.read(); 00176 if (c == '\n') { 00177 lineNumber++; 00178 columnNumber = 1; 00179 } else { 00180 columnNumber++; 00181 } 00182 return c; 00183 } 00184 00186 int len = 0; 00187 int c; 00188 int state = 0; 00189 while ((c = read()) >= 0) { 00190 ebuf[len++] = (char)c; 00191 if (!Character.isLetter((char)c) || len >= ebuf.length) break; 00192 } 00193 lastEntityVal = new String(ebuf, 0, len); 00194 if (len == 5 && ebuf[0] == 'q' && ebuf[1] == 'u' && 00195 ebuf[2] == 'o' && ebuf[3] == 't') { 00196 return '"'; 00197 } 00198 if (len == 4 && ebuf[0] == 'a' && ebuf[1] == 'm' && ebuf[2] == 'p') { 00199 return '&'; 00200 } 00201 if (len == 3) { 00202 if (ebuf[0] == 'l') { 00203 if (ebuf[1] == 't') return '<'; 00204 } else if (ebuf[0] == 'g') { 00205 if (ebuf[1] == 't') return '>'; 00206 } 00207 } 00208 throw new SAXException("unknown entity: " + lastEntityVal); 00209 00210 } 00211 00213 // Debug.println("step[" + state + " " + commentLevel 
+ 00214 // "]: " + ((char)c)); 00215 switch (state) { 00216 case 0: 00217 if (c == '<') { 00218 if (data.size() > 0) { 00220 data.reset(); 00221 } 00222 state = 1; 00223 } else { 00224 if (c == '&') { 00225 try { 00226 c = parseEntity(); 00228 data.write('&'); 00229 data.write(lastEntityVal); 00230 break; 00231 } 00232 } 00233 data.write(c); 00234 } 00235 break; 00236 case 1: // seen '<' 00237 switch (c) { 00238 case '!': 00239 state = 30; 00240 break; 00241 case '\\': 00242 state = 4; 00243 break; 00244 case '/': 00245 state = 8; 00246 break; 00247 case '?': 00248 data.reset(); 00249 state = 20; 00250 break; 00251 default: 00252 addTagChar(c); 00253 state = 5; 00254 break; 00255 } 00256 break; 00257 case 4: // seen <\ 00258 data.write('<'); 00259 data.write(c); 00260 state = 0; 00261 break; 00262 case 5: // collect tag name 00263 switch (c) { 00264 case ' ': case '\r': case '\n': case '\t': 00265 tagName = pool.intern(tag, 0, taglen); 00266 taglen = 0; 00267 state = 6; 00268 break; 00269 case '/': 00270 tagName = pool.intern(tag, 0, taglen); 00271 taglen = 0; 00272 state = 9; 00273 break; 00274 case '>': 00275 tagName = pool.intern(tag, 0, taglen); 00276 taglen = 0; 00277 state = 0; 00278 startElement(tagName, attributes); 00279 break; 00280 case '<': 00281 tagName = pool.intern(tag, 0, taglen); 00282 taglen = 0; 00283 if (data.size() > 0) { 00285 0, data.size()); 00286 data.reset(); 00287 } 00288 state = 1; 00289 break; 00290 default: 00291 if (Character.isLetter((char)c) || 00292 Character.isDigit((char)c) || 00293 c == '.' || c == '-' || c == '_' || c == ':') { 00294 addTagChar(c); 00295 } else { 00296 // this isn't a tag after all (e.g., inside a <script> 00297 // section, we've found "if (a < b) ..." 
00298 for (int i = 0; i < taglen; i++) { 00299 data.write(tag[i]); 00300 } 00301 data.write(c); 00302 state = 0; 00303 taglen = 0; 00304 break; 00305 } 00306 00307 } 00308 break; 00309 case 6: // collect attributes 00310 switch (c) { 00311 case ' ': case '\n': case '\r': case '\t': 00312 break; 00313 case '/': 00314 state = 9; 00315 break; 00316 case '%': 00317 addTagChar(c); 00318 break; 00319 case '>': 00320 state = 0; 00321 startElement(tagName, attributes); 00322 break; 00323 case '=': 00324 attrName = pool.intern(tag, 0, taglen); 00325 taglen = 0; 00326 state = 10; 00327 break; 00328 case '<': 00329 state = 61; 00330 break; 00331 default: 00332 addTagChar(c); 00333 } 00334 break; 00335 case 61: 00336 switch (c) { 00337 case '?': 00338 state = 62; 00339 break; 00340 default: 00341 addTagChar('<'); 00342 addTagChar(c); 00343 state = 6; 00344 break; 00345 } 00346 break; 00347 case 62: 00348 switch (c) { 00349 case '?': 00350 state = 63; 00351 break; 00352 default: 00353 addTagChar(c); 00354 break; 00355 } 00356 break; 00357 case 63: 00358 switch(c) { 00359 case '>': 00360 addTagChar(c); 00361 state = 6; 00362 break; 00363 default: 00364 addTagChar('?'); 00365 if (c != '?') state = 62; 00366 break; 00367 } 00368 break; 00369 case 8: // seen </ 00370 if (c == '>') { 00371 tagName = pool.intern(tag, 0, taglen); 00372 taglen = 0; 00373 state = 0; 00375 } else { 00376 addTagChar(c); 00377 } 00378 break; 00379 case 9: // in <tag, seen / 00380 if (c == '>') { 00381 startElement(tagName, attributes); 00382 state = 0; 00384 } else { 00385 addTagChar('/'); 00386 addTagChar(c); 00387 state = 6; 00388 } 00389 break; 00390 case 10: // in attriblist, seen name= 00391 if (c == '"') { 00392 state = 12; 00393 } else if (c == '\'') { 00394 state = 121; 00395 } else { 00396 addTagChar(c); 00397 state = 13; 00398 } 00399 break; 00400 case 12: // in attriblist, seen name=" 00401 if (c == '"') { 00402 attributes.addAttribute(attrName, "CDATA", 00403 pool.intern(tag, 0, taglen)); 00404 
taglen = 0; 00405 state = 6; 00406 } else { 00407 addTagChar(c); 00408 } 00409 break; 00410 case 121: // in attriblist, seen name=' 00411 if (c == '\'') { 00412 attributes.addAttribute(attrName, "CDATA", 00413 pool.intern(tag, 0, taglen)); 00414 taglen = 0; 00415 state = 6; 00416 } else { 00417 addTagChar(c); 00418 } 00419 break; 00420 case 13: // in attriblist, seen name=c 00421 switch (c) { 00422 case ' ': 00423 attributes.addAttribute(attrName, "CDATA", 00424 pool.intern(tag, 0, taglen)); 00425 taglen = 0; 00426 state = 6; 00427 break; 00428 case '/': 00429 state = 14; 00430 break; 00431 case '>': 00432 attributes.addAttribute(attrName, "CDATA", 00433 pool.intern(tag, 0, taglen)); 00434 taglen = 0; 00435 state = 0; 00436 startElement(tagName, attributes); 00437 break; 00438 default: 00439 addTagChar(c); 00440 } 00441 break; 00442 case 14: // in attriblist, seen name=dfdf/ 00443 if (c == '>') { 00444 attributes.addAttribute(attrName, "CDATA", 00445 pool.intern(tag, 0, taglen)); 00446 taglen = 0; 00447 state = 0; 00448 startElement(tagName, attributes); 00450 } else { 00451 addTagChar('/'); 00452 if (c != '/') { 00453 addTagChar(c); 00454 state = 13; 00455 } 00456 } 00457 break; 00458 case 15: 00459 if (c == '-') state = 16; 00460 break; 00461 case 16: 00462 if (c == '-') state = 17; 00463 else state = 15; 00464 break; 00465 case 17: 00466 if (c == '>') state = 0; 00467 else if (c != '-') state = 15; 00468 break; 00469 case 20: 00470 if (c == '?') state = 21; 00471 else data.write(c); 00472 break; 00473 case 21: 00474 if (c == '>') { 00475 String s = data.toString().trim(); 00476 if (s.startsWith("xml")) { 00477 if (inStack == null || inStack.size() == 0) { 00478 if (!docStarted) { 00479 docStarted = true; 00481 } 00482 } 00483 } else { 00484 int idx = s.indexOf(' '); 00485 String dat = ""; 00486 String target = s; 00487 if (idx >= 0) { 00488 target = s.substring(0, idx); 00489 dat = s.substring(idx+1).trim(); 00490 } 00492 } 00493 data.reset(); 00494 state = 0; 
00495 } else { 00496 data.write('?'); 00497 if (c != '?') { 00498 data.write(c); 00499 state = 20; 00500 } 00501 } 00502 break; 00503 case 30: // seen <! 00504 if (c == '-') state = 31; 00505 else if (c == '[') state = 41; 00506 else state = 40; 00507 break; 00508 case 31: // seen <!- 00509 if (c == '-') { 00510 commentLevel = 1; 00511 state = 32; 00512 } 00513 else state = 40; 00514 break; 00515 case 32: // in comment, look for '-' 00516 if (c == '-') state = 33; 00517 else if (c == '<') state = 320; 00518 break; 00519 case 320: // in comment, seen < 00520 if (c == '!') state = 321; 00521 else if (c == '-') state = 33; 00522 else state = 32; 00523 break; 00524 case 321: // in comment, seen <! 00525 if (c == '-') state = 322; 00526 else state = 32; 00527 break; 00528 case 322: // in comment, seen <!- 00529 if (c == '-') { 00530 commentLevel++; 00531 } 00532 state = 32; 00533 break; 00534 case 33: // in comment, seen - 00535 if (c == '-') state = 34; 00536 else state = 32; 00537 break; 00538 case 34: // in comment, seen -- 00539 if (c == '>') { 00540 if (--commentLevel == 0) { 00541 state = 0; 00542 } else { 00543 state = 32; 00544 } 00545 } 00546 else if (c != '-') state = 32; 00547 break; 00548 case 40: // seen <!, but not comment 00549 if (c == '>') state = 0; 00550 break; 00551 case 41: // seen <![ 00552 if (c == '[') { 00553 if (data.toString().equals("CDATA")) { 00554 data.reset(); 00555 state = 42; 00556 } else { 00557 state = 40; 00558 } 00559 } else { 00560 data.write(c); 00561 } 00562 break; 00563 case 42: // in CDATA section 00564 if (c == ']') { 00565 state = 43; 00566 } else { 00567 data.write(c); 00568 } 00569 break; 00570 case 43: // in CDATA, seen ']' 00571 if (c == ']') { 00572 state = 44; 00573 } else { 00574 data.write(']'); 00575 data.write(c); 00576 state = 42; 00577 } 00578 break; 00579 case 44: // in CDATA, seen ']]' 00580 if (c == '>') { 00581 state = 0; 00582 } else if (c == ']') { 00583 data.write(']'); 00584 } else { 00585 
data.write("]]"); 00586 data.write(c); 00587 state = 42; 00588 } 00589 break; 00590 default: 00592 } 00593 return state; 00594 } 00595 00597 int state = 0; 00599 while (parseUntilEOF()) {} 00601 } 00602 00604 boolean ret = false; 00605 int state = 0; 00606 while (state >= 0) { 00607 int c = read(); 00608 if (c < 0) { 00609 try { r.close(); } catch (Exception e) {} 00610 ret = popInputSource(); 00611 state = -1; 00612 } else { 00613 state = step(state, c); 00614 } 00615 } 00616 return ret; 00617 } 00618 00619 public int getLineNumber() { 00620 return lineNumber; 00621 } 00622 00623 public int getColumnNumber() { 00624 return columnNumber; 00625 } 00626 00628 if (!docStarted) { 00629 docStarted = true; 00631 } 00633 attributes.clear(); 00634 } 00635 00636 }