Quadcap Embeddable Server

com/quadcap/http/client/HtmlParser.java

Go to the documentation of this file.
00001 package com.quadcap.http.client; 00002 00003 /* Copyright 1999 - 2003 Quadcap Software. All rights reserved. 00004 * 00005 * This software is distributed under the Quadcap Free Software License. 00006 * This software may be used or modified for any purpose, personal or 00007 * commercial. Open Source redistributions are permitted. Commercial 00008 * redistribution of larger works derived from, or works which bundle 00009 * this software requires a "Commercial Redistribution License"; see 00010 * http://www.quadcap.com/purchase. 00011 * 00012 * Redistributions qualify as "Open Source" under one of the following terms: 00013 * 00014 * Redistributions are made at no charge beyond the reasonable cost of 00015 * materials and delivery. 00016 * 00017 * Redistributions are accompanied by a copy of the Source Code or by an 00018 * irrevocable offer to provide a copy of the Source Code for up to three 00019 * years at the cost of materials and delivery. Such redistributions 00020 * must allow further use, modification, and redistribution of the Source 00021 * Code under substantially the same terms as this license. 00022 * 00023 * Redistributions of source code must retain the copyright notices as they 00024 * appear in each source code file, these license terms, and the 00025 * disclaimer/limitation of liability set forth as paragraph 6 below. 00026 * 00027 * Redistributions in binary form must reproduce this Copyright Notice, 00028 * these license terms, and the disclaimer/limitation of liability set 00029 * forth as paragraph 6 below, in the documentation and/or other materials 00030 * provided with the distribution. 00031 * 00032 * The Software is provided on an "AS IS" basis. No warranty is 00033 * provided that the Software is free of defects, or fit for a 00034 * particular purpose. 00035 * 00036 * Limitation of Liability. Quadcap Software shall not be liable 00037 * for any damages suffered by the Licensee or any third party resulting 00038 * from use of the Software. 00039 */ 00040 00041 import java.io.CharArrayWriter; 00042 import java.io.IOException; 00043 import java.io.Reader; 00044 00045 import org.xml.sax.AttributeList; 00046 import org.xml.sax.DocumentHandler; 00047 import org.xml.sax.DTDHandler; 00048 import org.xml.sax.EntityResolver; 00049 import org.xml.sax.ErrorHandler; 00050 import org.xml.sax.InputSource; 00051 import org.xml.sax.Parser; 00052 import org.xml.sax.SAXException; 00053 00054 import org.xml.sax.helpers.AttributeListImpl; 00055 00056 import com.quadcap.util.collections.ArrayQueue; 00057 00058 /** 00059 * A SAX Parser for HTML. 00060 * 00061 * @author Stan Bailes 00062 */ 00063 public class HtmlParser implements Parser { 00064 InputSource in; 00065 Reader r; 00066 DocumentHandler docHandler = null; 00067 DTDHandler dtdHandler = null; 00068 EntityResolver entityResolver = null; 00069 CharArrayWriter tag = new CharArrayWriter(); 00070 CharArrayWriter data = new CharArrayWriter(); 00071 AttributeListImpl attributes = new AttributeListImpl(); 00072 String tagName = null; 00073 00074 final static int TAG = 1; 00075 00076 public HtmlParser() {} 00077 00079 this.in = in; 00080 this.r = in.getCharacterStream(); 00081 tag.reset(); 00082 data.reset(); 00083 parse(); 00084 } 00085 00086 public void parse(String s) {} 00087 00088 public void setDocumentHandler(DocumentHandler dh) { 00089 this.docHandler = dh; 00090 } 00091 00092 public void setDTDHandler(DTDHandler dh) { 00093 this.dtdHandler = dh; 00094 } 00095 00096 public void setEntityResolver(EntityResolver er) { 00097 this.entityResolver = er; 00098 } 00099 00100 public EntityResolver getEntityResolver() { 00101 return entityResolver; 00102 } 00103 00105 } 00106 00107 public void setLocale(java.util.Locale locale) { 00108 } 00109 00111 int state = 0; 00112 int commentState = 0; 00113 String attrName = null; 00114 docHandler.startDocument(); 00115 while (state >= 0) { 00116 int c = r.read(); 00117 //System.out.println("[" + ((char)c) + "] [" + state + "] <" + tag.toString() + ">"); 00118 if (c < 0) { 00119 state = -1; 00120 break; 00121 } 00122 switch (commentState) { 00123 case 0: 00124 break; 00125 case 1: 00126 if (c == '-') commentState = 2; 00127 break; 00128 case 2: 00129 if (c == '-') commentState = 3; 00130 else commentState = 1; 00131 break; 00132 case 3: 00133 if (c == '>') commentState = 0; 00134 else if (c != '-') commentState = 1; 00135 } 00136 00137 switch (state) { 00138 case 0: 00139 if (c == '<') { 00140 if (data.size() > 0) { 00141 docHandler.characters(data.toCharArray(), 0, data.size()); 00142 data.reset(); 00143 } 00144 state = 1; 00145 } else { 00146 data.write(c); 00147 } 00148 break; 00149 case 1: // seen '<' 00150 switch (c) { 00151 case '!': 00152 data.write('<'); 00153 data.write('!'); 00154 commentState = 1; 00155 state = 0; 00156 break; 00157 case '/': 00158 state = 8; 00159 break; 00160 default: 00161 tag.write(c); 00162 state = 5; 00163 break; 00164 } 00165 break; 00166 case 5: // collect tag name 00167 switch (c) { 00168 case ' ': 00169 tagName = tag.toString(); 00170 tag.reset(); 00171 state = 6; 00172 break; 00173 case '/': 00174 tagName = tag.toString(); 00175 tag.reset(); 00176 state = 9; 00177 break; 00178 case '>': 00179 tagName = tag.toString(); 00180 tag.reset(); 00181 docHandler.startElement(tagName, attributes); 00182 attributes.clear(); 00183 state = 0; 00184 break; 00185 default: 00186 tag.write(c); 00187 } 00188 break; 00189 case 6: // collect attributes 00190 switch (c) { 00191 case ' ': case '\n': case '\r': case '\t': 00192 break; 00193 case '/': 00194 state = 9; 00195 break; 00196 case '>': 00197 docHandler.startElement(tagName, attributes); 00198 attributes.clear(); 00199 state = 0; 00200 break; 00201 case '=': 00202 attrName = tag.toString(); 00203 tag.reset(); 00204 state = 10; 00205 break; 00206 default: 00207 tag.write(c); 00208 } 00209 break; 00210 case 8: // seen </ 00211 if (c == '>') { 00212 tagName = tag.toString(); 00213 tag.reset(); 00214 docHandler.endElement(tagName); 00215 state = 0; 00216 } else { 00217 tag.write(c); 00218 } 00219 break; 00220 case 9: // in <tag, seen / 00221 if (c == '>') { 00222 docHandler.startElement(tagName, attributes); 00223 attributes.clear(); 00224 docHandler.endElement(tagName); 00225 state = 0; 00226 } else { 00227 tag.write('/'); 00228 tag.write(c); 00229 state = 6; 00230 } 00231 break; 00232 case 10: // in attriblist, seen name= 00233 if (c == '"') { 00234 state = 12; 00235 } else if (c == '\'') { 00236 state = 121; 00237 } else { 00238 tag.write(c); 00239 state = 13; 00240 } 00241 break; 00242 case 12: // in attriblist, seen name=" 00243 if (c == '"') { 00244 attributes.addAttribute(attrName.toLowerCase(), "string", 00245 tag.toString()); 00246 tag.reset(); 00247 state = 6; 00248 } else { 00249 tag.write(c); 00250 } 00251 break; 00252 case 121: // in attriblist, seen name=' 00253 if (c == '\'') { 00254 attributes.addAttribute(attrName.toLowerCase(), "string", 00255 tag.toString()); 00256 tag.reset(); 00257 state = 6; 00258 } else { 00259 tag.write(c); 00260 } 00261 break; 00262 case 13: // in attriblist, seen name=c 00263 switch (c) { 00264 case ' ': 00265 attributes.addAttribute(attrName.toLowerCase(), "string", 00266 tag.toString()); 00267 tag.reset(); 00268 state = 6; 00269 break; 00270 case '/': 00271 state = 14; 00272 break; 00273 case '>': 00274 attributes.addAttribute(attrName.toLowerCase(), "string", 00275 tag.toString()); 00276 tag.reset(); 00277 docHandler.startElement(tagName, attributes); 00278 attributes.clear(); 00279 state = 0; 00280 break; 00281 default: 00282 tag.write(c); 00283 } 00284 break; 00285 case 14: // in attriblist, seen name=dfdf/ 00286 if (c == '>') { 00287 attributes.addAttribute(attrName.toLowerCase(), "string", 00288 tag.toString()); 00289 tag.reset(); 00290 docHandler.startElement(tagName, attributes); 00291 attributes.clear(); 00292 state = 0; 00293 } else { 00294 tag.write('/'); 00295 if (c != '/') { 00296 tag.write(c); 00297 state = 13; 00298 } 00299 } 00300 break; 00301 case 15: 00302 if (c == '-') state = 16; 00303 break; 00304 case 16: 00305 if (c == '-') state = 17; 00306 else state = 15; 00307 break; 00308 case 17: 00309 if (c == '>') state = 0; 00310 else if (c != '-') state = 15; 00311 break; 00312 } 00313 } 00314 } 00315 00316 }