Author: nextgens
Date: 2007-09-23 10:12:43 +0000 (Sun, 23 Sep 2007)
New Revision: 15284
Modified:
branches/freenet-jfk/src/freenet/clients/http/filter/CSSReadFilter.java
branches/freenet-jfk/src/freenet/clients/http/filter/CSSTokenizerFilter.jflex
branches/freenet-jfk/src/freenet/clients/http/filter/FilterCallback.java
branches/freenet-jfk/src/freenet/clients/http/filter/FoundURICallback.java
branches/freenet-jfk/src/freenet/clients/http/filter/HTMLFilter.java
branches/freenet-jfk/src/freenet/clients/http/filter/UnknownCharsetException.java
Log:
freenet-jfk: for some reason some files were left out of the merge before
last; this commit adds the missing changes
Modified:
branches/freenet-jfk/src/freenet/clients/http/filter/CSSReadFilter.java
===================================================================
--- branches/freenet-jfk/src/freenet/clients/http/filter/CSSReadFilter.java
2007-09-23 10:06:39 UTC (rev 15283)
+++ branches/freenet-jfk/src/freenet/clients/http/filter/CSSReadFilter.java
2007-09-23 10:12:43 UTC (rev 15284)
@@ -15,9 +15,6 @@
import java.io.Writer;
import java.util.HashMap;
-import freenet.l10n.L10n;
-import freenet.support.HTMLEncoder;
-import freenet.support.HTMLNode;
import freenet.support.Logger;
import freenet.support.api.Bucket;
import freenet.support.api.BucketFactory;
Modified:
branches/freenet-jfk/src/freenet/clients/http/filter/CSSTokenizerFilter.jflex
===================================================================
---
branches/freenet-jfk/src/freenet/clients/http/filter/CSSTokenizerFilter.jflex
2007-09-23 10:06:39 UTC (rev 15283)
+++
branches/freenet-jfk/src/freenet/clients/http/filter/CSSTokenizerFilter.jflex
2007-09-23 10:12:43 UTC (rev 15284)
@@ -239,7 +239,7 @@
IDENT={NMSTART}{NMCHAR}*
UNOFFICIAL_IDENT="-"{IDENT}
NAME={NMCHAR}+
-NUM=[0-9]+|[0-9]*"."[0-9]+
+NUM=(-){0,1}([0-9]+|[0-9]*"."[0-9]+)
STRING={STRING1}|{STRING2}
// Not used any more. Was used in url(). Keep for now. Matches up to the end of a bracket.
@@ -391,14 +391,6 @@
w.write(s);
if(debug) log("Matched ident: "+s);
}
-{UNOFFICIAL_IDENT} {
- if(debug) log("Deleted unofficial ident: "+yytext());
- w.write("/* " + l10n("deletedUnofficialIdent") + " */");
-}
-{UNOFFICIAL_IDENT}{W}":"{W}{REALURL} {
- if(debug) log("Deleted unofficial ident with url: "+yytext());
- w.write("/* " + l10n("deletedUnofficialIdentWithURL") + " */");
-}
"@page" {
String s = yytext();
w.write(s);
@@ -445,7 +437,6 @@
w.write(s);
if(debug) log("Matched number: "+s);
}
-
{MEDIUMS}{W}*";" {
if(postBadImportFlag) {
// Ignore
@@ -458,7 +449,6 @@
if(debug) log("Matched and passing on mediums list: "+s);
}
}
-
"@charset"{W}*{STRING}{W}*";" {
String s = yytext();
detectedCharset = s;
@@ -511,6 +501,14 @@
// Ignore
}
}
+{UNOFFICIAL_IDENT} {
+ if(debug) log("Deleted unofficial ident: "+yytext());
+ w.write("/* " + l10n("deletedUnofficialIdent") + " */");
+}
+{UNOFFICIAL_IDENT}{W}":"{W}{REALURL} {
+ if(debug) log("Deleted unofficial ident with url: "+yytext());
+ w.write("/* " + l10n("deletedUnofficialIdentWithURL") + " */");
+}
// Default rule matches only one character
. {
String s = yytext();
Modified:
branches/freenet-jfk/src/freenet/clients/http/filter/FilterCallback.java
===================================================================
--- branches/freenet-jfk/src/freenet/clients/http/filter/FilterCallback.java
2007-09-23 10:06:39 UTC (rev 15283)
+++ branches/freenet-jfk/src/freenet/clients/http/filter/FilterCallback.java
2007-09-23 10:12:43 UTC (rev 15284)
@@ -23,7 +23,11 @@
/**
* Process plain-text. Notification only; can't modify.
* Type can be null, or can correspond, for example to HTML tag name around text
- * (for example: "title")
+ * (for example: "title").
+ *
+ * Note that the string will have been fed through the relevant decoder if
+ * necessary (e.g. HTMLDecoder). It must be re-encoded if it is sent out as
+ * text to a browser.
*/
public void onText(String s, String type);
Modified:
branches/freenet-jfk/src/freenet/clients/http/filter/FoundURICallback.java
===================================================================
--- branches/freenet-jfk/src/freenet/clients/http/filter/FoundURICallback.java
2007-09-23 10:06:39 UTC (rev 15283)
+++ branches/freenet-jfk/src/freenet/clients/http/filter/FoundURICallback.java
2007-09-23 10:12:43 UTC (rev 15284)
@@ -9,11 +9,25 @@
public interface FoundURICallback {
+ /**
+ * Called when a Freenet URI is found.
+ * @param uri The URI.
+ * FIXME: Indicate the type of the link e.g. inline image, hyperlink, etc??
+ */
public void foundURI(FreenetURI uri);
- /* type can be null */
- /* but type can also be, for example, HTML tag name around text */
- /* Usefull to find things like titles */
- public void onText(String s, String type, URI baseURI);
+ /**
+ * Called when some plain text is processed. This is used typically by
+ * spiders to index pages by their content.
+ * @param text The text. Will already have been fed through whatever decoding
+ * is necessary depending on the type of the source document e.g. HTMLDecoder.
+ * Will need to be re-encoded before being sent to e.g. a browser.
+ * @param type Can be null, or may be for example the name of the HTML tag
+ * directly surrounding the text. E.g. "title" lets you find page titles.
+ * @param baseURI The current base URI for this page. The base URI is not
+ * necessarily the URI of the page. It's the URI against which URIs on the
+ * page are resolved. It defaults to the URI of the page but can be overridden
+ * by base href in html, for example. */
+ public void onText(String text, String type, URI baseURI);
}
Modified: branches/freenet-jfk/src/freenet/clients/http/filter/HTMLFilter.java
===================================================================
--- branches/freenet-jfk/src/freenet/clients/http/filter/HTMLFilter.java
2007-09-23 10:06:39 UTC (rev 15283)
+++ branches/freenet-jfk/src/freenet/clients/http/filter/HTMLFilter.java
2007-09-23 10:12:43 UTC (rev 15284)
@@ -36,12 +36,14 @@
public class HTMLFilter implements ContentDataFilter, CharsetExtractor {
private static boolean logMINOR;
+ private static boolean logDEBUG;
private static boolean deleteWierdStuff = true;
private static boolean deleteErrors = true;
public Bucket readFilter(Bucket bucket, BucketFactory bf, String
charset, HashMap otherParams, FilterCallback cb) throws DataFilterException,
IOException {
logMINOR = Logger.shouldLog(Logger.MINOR, this);
+ logDEBUG = Logger.shouldLog(Logger.DEBUG, this);
if(logMINOR) Logger.minor(this, "readFilter():
charset="+charset);
InputStream strm = bucket.getInputStream();
BufferedInputStream bis = new BufferedInputStream(strm, 4096);
@@ -80,14 +82,28 @@
BufferedInputStream bis = new BufferedInputStream(strm, 4096);
Writer w = new NullWriter();
Reader r;
- r = new BufferedReader(new InputStreamReader(bis,
parseCharset), 4096);
+ try {
+ r = new BufferedReader(new InputStreamReader(bis,
parseCharset), 4096);
+ } catch (UnsupportedEncodingException e) {
+ strm.close();
+ throw e;
+ }
HTMLParseContext pc = new HTMLParseContext(r, w, null, new
NullFilterCallback(), true);
try {
pc.run(null);
+ } catch (IOException e) {
+ throw e;
} catch (Throwable t) {
// Ignore ALL errors
if(logMINOR) Logger.minor(this, "Caught "+t+" trying to
detect MIME type with "+parseCharset);
}
+ try {
+ r.close();
+ } catch (IOException e) {
+ throw e;
+ } catch (Throwable t) {
+ if(logMINOR) Logger.minor(this, "Caught "+t+" closing
stream after trying to detect MIME type with "+parseCharset);
+ }
if(logMINOR) Logger.minor(this, "Returning charset
"+pc.detectedCharset);
return pc.detectedCharset;
}
@@ -324,7 +340,7 @@
if(pc.noOutput) return;
- if(logMINOR) Logger.minor(this, "Saving text: "+s.toString());
+ if(logDEBUG) Logger.debug(this, "Saving text: "+s.toString());
if (pc.killText) {
return;
}
@@ -355,7 +371,7 @@
}
String sout = out.toString();
if(pc.cb != null)
- pc.cb.onText(sout, tagName); /* Tag name is given as
type for the text */
+ pc.cb.onText(HTMLDecoder.decode(sout), tagName); /* Tag
name is given as type for the text */
w.write(sout);
}
@@ -363,9 +379,9 @@
void processTag(Vector splitTag, Writer w, HTMLParseContext pc)
throws IOException, DataFilterException {
// First, check that it is a recognized tag
- if(logMINOR) {
+ if(logDEBUG) {
for(int i=0;i<splitTag.size();i++)
- Logger.minor(this,
"Tag["+i+"]="+splitTag.get(i));
+ Logger.debug(this,
"Tag["+i+"]="+splitTag.get(i));
}
ParsedTag t = new ParsedTag(splitTag);
if (!pc.killTag) {
@@ -404,7 +420,7 @@
if(s.charAt(s.length()-1) == '-')
s.setLength(s.length()-1);
}
- if(logMINOR) Logger.minor(this, "Saving comment:
"+s.toString());
+ if(logDEBUG) Logger.debug(this, "Saving comment:
"+s.toString());
if (pc.expectingBadComment)
return; // ignore it
@@ -486,13 +502,13 @@
unparsedAttrs[x - 1] = (String)
v.elementAt(x);
} else
unparsedAttrs = new String[0];
- if(logMINOR) Logger.minor(this, "Element = "+element);
+ if(logDEBUG) Logger.debug(this, "Element = "+element);
}
public ParsedTag sanitize(HTMLParseContext pc) throws
DataFilterException {
TagVerifier tv =
(TagVerifier)
allowedTagsVerifiers.get(element.toLowerCase());
- if(logMINOR) Logger.minor(this, "Got verifier: "+tv+"
for "+element);
+ if(logDEBUG) Logger.debug(this, "Got verifier: "+tv+"
for "+element);
if (tv == null) {
if (deleteWierdStuff) {
return null;
@@ -596,7 +612,8 @@
"u",
"noframes",
"fieldset",
- "noscript",
+// Delete <noscript> / </noscript>. So we can at least see the non-scripting
code.
+// "noscript",
"xmp",
"listing",
"plaintext",
@@ -1224,7 +1241,7 @@
Hashtable h,
Hashtable hn,
HTMLParseContext pc) throws DataFilterException {
- if(logMINOR) Logger.minor(this, "Finishing
script/style");
+ if(logDEBUG) Logger.debug(this, "Finishing
script/style");
// Finishing
setStyle(false, pc);
pc.styleScriptRecurseCount--;
@@ -1249,7 +1266,7 @@
}
Hashtable start(Hashtable h, Hashtable hn, HTMLParseContext pc)
throws DataFilterException {
- if(logMINOR) Logger.minor(this, "Starting
script/style");
+ if(logDEBUG) Logger.debug(this, "Starting
script/style");
pc.styleScriptRecurseCount++;
if (pc.styleScriptRecurseCount > 1) {
if (deleteErrors)
@@ -1481,7 +1498,7 @@
type = typesplit[0];
if ((typesplit[1] != null) &&
(typesplit[1].length() > 0))
charset = typesplit[1];
- if(logMINOR)
+ if(logDEBUG)
Logger.debug(
this,
"Processing link tag,
type="
@@ -1676,9 +1693,9 @@
} else if
(http_equiv.equalsIgnoreCase("Content-Type")) {
if(logMINOR) Logger.minor(this,
"Found http-equiv content-type="+content);
String[] typesplit =
splitType(content);
- if(logMINOR) {
+ if(logDEBUG) {
for(int
i=0;i<typesplit.length;i++)
-
Logger.minor(this, "["+i+"] = "+typesplit[i]);
+
Logger.debug(this, "["+i+"] = "+typesplit[i]);
}
if
(typesplit[0].equalsIgnoreCase("text/html")
&& ((typesplit[1] ==
null)
@@ -1846,7 +1863,7 @@
if ((s == null) || (s.length() == 0))
return null;
// Core.logger.log(SaferFilter.class, "Style now:
" + s, Logger.DEBUG);
- if(logMINOR) Logger.debug(HTMLFilter.class, "Style finally: " +
s);
+ if(logMINOR) Logger.minor(HTMLFilter.class, "Style finally: " +
s);
return s;
}
Modified:
branches/freenet-jfk/src/freenet/clients/http/filter/UnknownCharsetException.java
===================================================================
---
branches/freenet-jfk/src/freenet/clients/http/filter/UnknownCharsetException.java
2007-09-23 10:06:39 UTC (rev 15283)
+++
branches/freenet-jfk/src/freenet/clients/http/filter/UnknownCharsetException.java
2007-09-23 10:12:43 UTC (rev 15284)
@@ -6,7 +6,7 @@
import freenet.support.HTMLNode;
public class UnknownCharsetException extends DataFilterException {
-
+ private static final long serialVersionUID = 1L;
public final String charset;
private UnknownCharsetException(String warning, String warning2, String
string, HTMLNode explanation, String charset) {