filter

[email protected] Sun, 23 Sep 2007 10:12:43 +0000 (UTC)

Author: nextgens
Date: 2007-09-23 10:12:43 +0000 (Sun, 23 Sep 2007)
New Revision: 15284


Modified:
   branches/freenet-jfk/src/freenet/clients/http/filter/CSSReadFilter.java
   branches/freenet-jfk/src/freenet/clients/http/filter/CSSTokenizerFilter.jflex
   branches/freenet-jfk/src/freenet/clients/http/filter/FilterCallback.java
   branches/freenet-jfk/src/freenet/clients/http/filter/FoundURICallback.java
   branches/freenet-jfk/src/freenet/clients/http/filter/HTMLFilter.java
   branches/freenet-jfk/src/freenet/clients/http/filter/UnknownCharsetException.java
Log:
freenet-jfk: for some reason, some files were left out of the merge before 
last; this commit adds the missing changes.

Modified: 
branches/freenet-jfk/src/freenet/clients/http/filter/CSSReadFilter.java
===================================================================
--- branches/freenet-jfk/src/freenet/clients/http/filter/CSSReadFilter.java     
2007-09-23 10:06:39 UTC (rev 15283)
+++ branches/freenet-jfk/src/freenet/clients/http/filter/CSSReadFilter.java     
2007-09-23 10:12:43 UTC (rev 15284)
@@ -15,9 +15,6 @@
 import java.io.Writer;
 import java.util.HashMap;

-import freenet.l10n.L10n;
-import freenet.support.HTMLEncoder;
-import freenet.support.HTMLNode;
 import freenet.support.Logger;
 import freenet.support.api.Bucket;
 import freenet.support.api.BucketFactory;

Modified: 
branches/freenet-jfk/src/freenet/clients/http/filter/CSSTokenizerFilter.jflex
===================================================================
--- 
branches/freenet-jfk/src/freenet/clients/http/filter/CSSTokenizerFilter.jflex   
    2007-09-23 10:06:39 UTC (rev 15283)
+++ 
branches/freenet-jfk/src/freenet/clients/http/filter/CSSTokenizerFilter.jflex   
    2007-09-23 10:12:43 UTC (rev 15284)
@@ -239,7 +239,7 @@
 IDENT={NMSTART}{NMCHAR}*
 UNOFFICIAL_IDENT="-"{IDENT}
 NAME={NMCHAR}+
-NUM=[0-9]+|[0-9]*"."[0-9]+
+NUM=(-){0,1}([0-9]+|[0-9]*"."[0-9]+)
 STRING={STRING1}|{STRING2}

 // Not used any more. Was used in url(). Keep for now. Matches up to the end 
of a bracket.
@@ -391,14 +391,6 @@
        w.write(s);
        if(debug) log("Matched ident: "+s);
 }
-{UNOFFICIAL_IDENT} {
-       if(debug) log("Deleted unofficial ident: "+yytext());
-       w.write("/* " + l10n("deletedUnofficialIdent") + " */");
-}
-{UNOFFICIAL_IDENT}{W}":"{W}{REALURL} {
-       if(debug) log("Deleted unofficial ident with url: "+yytext());
-       w.write("/* " + l10n("deletedUnofficialIdentWithURL") + " */");
-}
 "@page" {
        String s = yytext();
        w.write(s);
@@ -445,7 +437,6 @@
        w.write(s);
        if(debug) log("Matched number: "+s);
 }
-
 {MEDIUMS}{W}*";" {
        if(postBadImportFlag) {
                // Ignore
@@ -458,7 +449,6 @@
                if(debug) log("Matched and passing on mediums list: "+s);
        }
 }
-
 "@charset"{W}*{STRING}{W}*";" {
        String s = yytext();
        detectedCharset = s;
@@ -511,6 +501,14 @@
                // Ignore
        }
 }
+{UNOFFICIAL_IDENT} {
+       if(debug) log("Deleted unofficial ident: "+yytext());
+       w.write("/* " + l10n("deletedUnofficialIdent") + " */");
+}
+{UNOFFICIAL_IDENT}{W}":"{W}{REALURL} {
+       if(debug) log("Deleted unofficial ident with url: "+yytext());
+       w.write("/* " + l10n("deletedUnofficialIdentWithURL") + " */");
+}
 // Default rule matches only one character
 . {
        String s = yytext();

Modified: 
branches/freenet-jfk/src/freenet/clients/http/filter/FilterCallback.java
===================================================================
--- branches/freenet-jfk/src/freenet/clients/http/filter/FilterCallback.java    
2007-09-23 10:06:39 UTC (rev 15283)
+++ branches/freenet-jfk/src/freenet/clients/http/filter/FilterCallback.java    
2007-09-23 10:12:43 UTC (rev 15284)
@@ -23,7 +23,11 @@
        /**
         * Process plain-text. Notification only; can't modify.
         * Type can be null, or can correspond, for example to HTML tag name 
around text
-        *    (for example: "title")
+        * (for example: "title").
+        *    
+        * Note that the string will have been fed through the relevant decoder 
if 
+        * necessary (e.g. HTMLDecoder). It must be re-encoded if it is sent 
out as
+        * text to a browser.
         */
        public void onText(String s, String type);


Modified: 
branches/freenet-jfk/src/freenet/clients/http/filter/FoundURICallback.java
===================================================================
--- branches/freenet-jfk/src/freenet/clients/http/filter/FoundURICallback.java  
2007-09-23 10:06:39 UTC (rev 15283)
+++ branches/freenet-jfk/src/freenet/clients/http/filter/FoundURICallback.java  
2007-09-23 10:12:43 UTC (rev 15284)
@@ -9,11 +9,25 @@

 public interface FoundURICallback {

+       /**
+        * Called when a Freenet URI is found.
+        * @param uri The URI.
+        * FIXME: Indicate the type of the link e.g. inline image, hyperlink, 
etc??
+        */
        public void foundURI(FreenetURI uri);

-       /* type can be null */
-       /* but type can also be, for example, HTML tag name around text */
-       /* Usefull to find things like titles */
-       public void onText(String s, String type, URI baseURI);
+       /**
+        * Called when some plain text is processed. This is used typically by
+        * spiders to index pages by their content.
+        * @param text The text. Will already have been fed through whatever 
decoding
+        * is necessary depending on the type of the source document e.g. 
HTMLDecoder.
+        * Will need to be re-encoded before being sent to e.g. a browser.
+        * @param type Can be null, or may be for example the name of the HTML 
tag
+        * directly surrounding the text. E.g. "title" lets you find page 
titles.
+        * @param baseURI The current base URI for this page. The base URI is 
not
+        * necessarily the URI of the page. It's the URI against which URIs on 
the
+        * page are resolved. It defaults to the URI of the page but can be 
overridden
+        * by base href in html, for example.    */
+       public void onText(String text, String type, URI baseURI);

 }

Modified: branches/freenet-jfk/src/freenet/clients/http/filter/HTMLFilter.java
===================================================================
--- branches/freenet-jfk/src/freenet/clients/http/filter/HTMLFilter.java        
2007-09-23 10:06:39 UTC (rev 15283)
+++ branches/freenet-jfk/src/freenet/clients/http/filter/HTMLFilter.java        
2007-09-23 10:12:43 UTC (rev 15284)
@@ -36,12 +36,14 @@
 public class HTMLFilter implements ContentDataFilter, CharsetExtractor {

        private static boolean logMINOR;
+       private static boolean logDEBUG;

        private static boolean deleteWierdStuff = true;
        private static boolean deleteErrors = true;

        public Bucket readFilter(Bucket bucket, BucketFactory bf, String 
charset, HashMap otherParams, FilterCallback cb) throws DataFilterException, 
IOException {
                logMINOR = Logger.shouldLog(Logger.MINOR, this);
+               logDEBUG = Logger.shouldLog(Logger.DEBUG, this);
                if(logMINOR) Logger.minor(this, "readFilter(): 
charset="+charset);
                InputStream strm = bucket.getInputStream();
                BufferedInputStream bis = new BufferedInputStream(strm, 4096);
@@ -80,14 +82,28 @@
                BufferedInputStream bis = new BufferedInputStream(strm, 4096);
                Writer w = new NullWriter();
                Reader r;
-               r = new BufferedReader(new InputStreamReader(bis, 
parseCharset), 4096);
+               try {
+                       r = new BufferedReader(new InputStreamReader(bis, 
parseCharset), 4096);
+               } catch (UnsupportedEncodingException e) {
+                       strm.close();
+                       throw e;
+               }
                HTMLParseContext pc = new HTMLParseContext(r, w, null, new 
NullFilterCallback(), true);
                try {
                        pc.run(null);
+               } catch (IOException e) {
+                       throw e;
                } catch (Throwable t) {
                        // Ignore ALL errors
                        if(logMINOR) Logger.minor(this, "Caught "+t+" trying to 
detect MIME type with "+parseCharset);
                }
+               try {
+                       r.close();
+               } catch (IOException e) {
+                       throw e;
+               } catch (Throwable t) {
+                       if(logMINOR) Logger.minor(this, "Caught "+t+" closing 
stream after trying to detect MIME type with "+parseCharset);
+               }
                if(logMINOR) Logger.minor(this, "Returning charset 
"+pc.detectedCharset);
                return pc.detectedCharset;
        }
@@ -324,7 +340,7 @@

                if(pc.noOutput) return;

-               if(logMINOR) Logger.minor(this, "Saving text: "+s.toString());
+               if(logDEBUG) Logger.debug(this, "Saving text: "+s.toString());
                if (pc.killText) {
                        return;
                }
@@ -355,7 +371,7 @@
                }
                String sout = out.toString();
                if(pc.cb != null)
-                       pc.cb.onText(sout, tagName); /* Tag name is given as 
type for the text */
+                       pc.cb.onText(HTMLDecoder.decode(sout), tagName); /* Tag 
name is given as type for the text */

                w.write(sout);
        }
@@ -363,9 +379,9 @@
        void processTag(Vector splitTag, Writer w, HTMLParseContext pc)
                throws IOException, DataFilterException {
                // First, check that it is a recognized tag
-               if(logMINOR) {
+               if(logDEBUG) {
                        for(int i=0;i<splitTag.size();i++)
-                               Logger.minor(this, 
"Tag["+i+"]="+splitTag.get(i));
+                               Logger.debug(this, 
"Tag["+i+"]="+splitTag.get(i));
                }
                ParsedTag t = new ParsedTag(splitTag);
                if (!pc.killTag) {
@@ -404,7 +420,7 @@
                        if(s.charAt(s.length()-1) == '-')
                                s.setLength(s.length()-1);
                }
-               if(logMINOR) Logger.minor(this, "Saving comment: 
"+s.toString());
+               if(logDEBUG) Logger.debug(this, "Saving comment: 
"+s.toString());
                if (pc.expectingBadComment)
                        return; // ignore it

@@ -486,13 +502,13 @@
                                        unparsedAttrs[x - 1] = (String) 
v.elementAt(x);
                        } else
                                unparsedAttrs = new String[0];
-                       if(logMINOR) Logger.minor(this, "Element = "+element);
+                       if(logDEBUG) Logger.debug(this, "Element = "+element);
                }

                public ParsedTag sanitize(HTMLParseContext pc) throws 
DataFilterException {
                        TagVerifier tv =
                                (TagVerifier) 
allowedTagsVerifiers.get(element.toLowerCase());
-                       if(logMINOR) Logger.minor(this, "Got verifier: "+tv+" 
for "+element);
+                       if(logDEBUG) Logger.debug(this, "Got verifier: "+tv+" 
for "+element);
                        if (tv == null) {
                                if (deleteWierdStuff) {
                                        return null;
@@ -596,7 +612,8 @@
                                "u",
                                "noframes",
                                "fieldset",
-                               "noscript",
+// Delete <noscript> / </noscript>. So we can at least see the non-scripting 
code.
+//                             "noscript",
                                "xmp",
                                "listing",
                                "plaintext",
@@ -1224,7 +1241,7 @@
                        Hashtable h,
                        Hashtable hn,
                        HTMLParseContext pc) throws DataFilterException {
-                       if(logMINOR) Logger.minor(this, "Finishing 
script/style");
+                       if(logDEBUG) Logger.debug(this, "Finishing 
script/style");
                        // Finishing
                        setStyle(false, pc);
                        pc.styleScriptRecurseCount--;
@@ -1249,7 +1266,7 @@
                }

                Hashtable start(Hashtable h, Hashtable hn, HTMLParseContext pc) 
throws DataFilterException {
-                       if(logMINOR) Logger.minor(this, "Starting 
script/style");
+                       if(logDEBUG) Logger.debug(this, "Starting 
script/style");
                        pc.styleScriptRecurseCount++;
                        if (pc.styleScriptRecurseCount > 1) {
                                if (deleteErrors)
@@ -1481,7 +1498,7 @@
                                type = typesplit[0];
                                if ((typesplit[1] != null) && 
(typesplit[1].length() > 0))
                                        charset = typesplit[1];
-                               if(logMINOR)
+                               if(logDEBUG)
                                        Logger.debug(
                                                        this,
                                                        "Processing link tag, 
type="
@@ -1676,9 +1693,9 @@
                                        } else if 
(http_equiv.equalsIgnoreCase("Content-Type")) {
                                                if(logMINOR) Logger.minor(this, 
"Found http-equiv content-type="+content);
                                                String[] typesplit = 
splitType(content);
-                                               if(logMINOR) {
+                                               if(logDEBUG) {
                                                        for(int 
i=0;i<typesplit.length;i++)
-                                                               
Logger.minor(this, "["+i+"] = "+typesplit[i]);
+                                                               
Logger.debug(this, "["+i+"] = "+typesplit[i]);
                                                }
                                                if 
(typesplit[0].equalsIgnoreCase("text/html")
                                                        && ((typesplit[1] == 
null)
@@ -1846,7 +1863,7 @@
                if ((s == null) || (s.length() == 0))
                        return null;
                //              Core.logger.log(SaferFilter.class, "Style now: 
" + s, Logger.DEBUG);
-               if(logMINOR) Logger.debug(HTMLFilter.class, "Style finally: " + 
s);
+               if(logMINOR) Logger.minor(HTMLFilter.class, "Style finally: " + 
s);
                return s;
        }


Modified: 
branches/freenet-jfk/src/freenet/clients/http/filter/UnknownCharsetException.java
===================================================================
--- 
branches/freenet-jfk/src/freenet/clients/http/filter/UnknownCharsetException.java
   2007-09-23 10:06:39 UTC (rev 15283)
+++ 
branches/freenet-jfk/src/freenet/clients/http/filter/UnknownCharsetException.java
   2007-09-23 10:12:43 UTC (rev 15284)
@@ -6,7 +6,7 @@
 import freenet.support.HTMLNode;

 public class UnknownCharsetException extends DataFilterException {
-
+       private static final long serialVersionUID = 1L;
        public final String charset;

        private UnknownCharsetException(String warning, String warning2, String 
string, HTMLNode explanation, String charset) {

[freenet-cvs] r15284 - branches/freenet-jfk/src/freenet/clients/http/filter

Reply via email to