From e997e0e63f4a28518747814c14d3c0785516790d Mon Sep 17 00:00:00 2001
From: Jonathan Bakke <[email protected]>
Date: Sun, 23 May 2021 18:28:23 -0700
Subject: [PATCH] Surf content filtering; beta demo.

Implements WebKit content filtering: the ContentFilter
parameter sets the importing of rules from a JSON file,
adding them to the user content manager, and removing
them. Currently lacking is any way to change this
parameter interactively; intended additions include
toggling in the UI and autodetection of file changes.

In separate files are a shell script that creates JSON
rules and a demo configuration file for that script.
These are entirely independent of surf but were written
to make the above changes to surf accessible.
---
 config.def.h      |   3 +
 mkwkcf.demo.rules |  17 +++++
 mkwkcf.sh         | 176 ++++++++++++++++++++++++++++++++++++++++++++++
 surf.1            |   8 ++-
 surf.c            | 102 ++++++++++++++++++++++++++-
 5 files changed, 304 insertions(+), 2 deletions(-)
 create mode 100644 mkwkcf.demo.rules
 create mode 100755 mkwkcf.sh

diff --git a/config.def.h b/config.def.h
index ef44721..c7a64c2 100644
--- a/config.def.h
+++ b/config.def.h
@@ -6,6 +6,8 @@ static char *styledir       = "~/.surf/styles/";
 static char *certdir        = "~/.surf/certificates/";
 static char *cachedir       = "~/.surf/cache/";
 static char *cookiefile     = "~/.surf/cookies.txt";
+static char *filterfile     = "~/.surf/filters.json"; /* JSON rules read by cfactivate() */
+static char *filterdir      = "~/.surf/filtmp/"; /* path given to the WebKit filter store */

 /* Webkit default features */
 /* Highest priority value will be used.
@@ -19,6 +21,7 @@ static Parameter defconfig[ParameterLast] = {
        [AccessWebcam]        =       { { .i = 0 },     },
        [Certificate]         =       { { .i = 0 },     },
        [CaretBrowsing]       =       { { .i = 0 },     },
+       [ContentFilter]       =       { { .i = 1 },     }, /* 1 = DoActivateFilters, 0 = NoFiltersAreActive (surf.c) */
        [CookiePolicies]      =       { { .v = "@Aa" }, },
        [DefaultCharset]      =       { { .v = "UTF-8" }, },
        [DiskCache]           =       { { .i = 1 },     },
diff --git a/mkwkcf.demo.rules b/mkwkcf.demo.rules
new file mode 100644
index 0000000..bad5b26
--- /dev/null
+++ b/mkwkcf.demo.rules
@@ -0,0 +1,17 @@
+# [top uri] [request uri] [1st/3rd party] [block/reset] [content type]
+
+# block everything
+. . . . .
+# allow all sites to load first-party HTML and CSS
+. . 1 reset doc`css
+# allow images and videos when visiting surf.suckless.org
+^https?://surf.suckless.org/ . . reset img`vid
+# block all connections matching evil.com/ when visiting any site
+. evil.com/ . . .
+# allow anything to happen on lazy.days
+^https?://.*lazy.days/ . . reset .
+# ...except third-party scripts and third-party raw data requests
+^https?://.*lazy.days/ . 3 . scr`raw
+# allow .local and .gov to load anything from their own domains
+^https?://.*local/`^https?://.*gov/ . 1 reset .
+surf.suck . . reset .
diff --git a/mkwkcf.sh b/mkwkcf.sh
new file mode 100755
index 0000000..14eb955
--- /dev/null
+++ b/mkwkcf.sh
@@ -0,0 +1,176 @@
+#!/bin/sh
+
+# generates WebKit content filter rules JSON file
+
+# copyright 2021 Jonathan Bakke; MIT/X Consortium license terms apply
+# this is new, untested, and unproven software. use at your own risk.
+
+# configuration
+newfile="$HOME/.surf/filters.json" # final JSON array written at the end of the script
+tmpfile="$HOME/.surf/filters.tmp" # scratch file that receives one JSON object per rule
+unit='`' # separator for multiple values inside one rule field
+
+# docs: print the usage text and the input-format reference to stderr
+usage() {
+       echo "Usage: $(basename "$0") [filename]" >&2
+       echo "This script translates a basic content filter configuration format into a" >&2
+       echo "WebKit content filter JSON rules file. The input format has one rule per" >&2
+       echo "line, with five arguments required per rule:" >&2
+       echo "   top-url request-url party action content-type" >&2
+       echo "Any argument may be '.' to include everything, or:" >&2
+       echo "   top-url is a regex that applies to the urlbar address" >&2
+       echo "   request-url is a regex that applies to each requested connection" >&2
+       echo "   party specifies first- [1*|f*] or third- [3*|t*] party requests" >&2
+       echo "   action is to either 'block' or 'reset' matching request rules, and" >&2
+       echo "   content-type may be: 'doc' for documents (e.g., HTML), 'css' for" >&2
+       echo "      style sheets, 'fnt' for fonts, 'img' for images, 'vid' for media" >&2
+       echo "      (e.g., video), 'scr' for scripts, 'raw' for untyped loads." >&2
+       echo "      Note that this only applies to network requests, JavaScript in" >&2
+       echo "      an allowed HTML doc will load." >&2
+       echo "top-url and content-type arguments may be combined using \"$unit\" as a" >&2
+       echo "separator; e.g., \"site1\`site2 . . . img\`vid\"." >&2
+       echo "Regular expressions appear broken in WebKit. '.' will match any char and" >&2
+       echo "'.*' will match any string, but '\\.' causes unexpected behavior." >&2
+       echo "Do not use anything more complex than '^https?://.*site.com/', and" >&2
+       echo "know that this will match 'http://sitexcom'." >&2
+       #echo "Regular expressions are limited to '.', '?', '+', '*', '()', and '[]'," >&2
+       #echo "and spaces need to be translated (e.g., '%20') before entry." >&2
+       echo "Blank lines and those beginning with '#' are ignored." >&2
+       echo "The rules file may be piped to $(basename "$0") instead of providing a file." >&2
+       echo "The output JSON file is specified at the top of this script." >&2
+}
+
+err_party() { # warn on stderr that $party was not recognised
+       echo "Unrecognized argument for party:"
+       echo "\"$party\" in $topurl $request $party $action $type"
+       echo "Accepted arguments begin with '1', 'f', '3' or 't',"
+       echo "or '.', '*', 'any' or 'all' to apply to both kinds."
+       echo "Applying rule to both first- and third- party requests."
+} >&2
+
+err_type() { # report on stderr the content-type item ($line) that was not recognised
+       echo "Unrecognized argument for type:"
+       echo "\"$line\" in $topurl $request $party $action $type"
+       echo "Accepted arguments include '.' or"
+       echo "[doc|img|css|scr|fnt|raw|vid|pop], or"
+       echo "a combination of grave-separated values, e.g. 'scr\`raw'"
+} >&2
+
+# check for dependencies, sanity, and input
+# (developed using dash and jo from Debian buster/stable;
+#  lightly tested with bash, mksh, and ksh93; OpenBSD testing pending)
+if [ ! "$(command -v jo)" ]; then
+       echo "The 'jo' utility is required. Check your package manager or" >&2
+       echo "https://github.com/jpmens/jo" >&2
+       exit 1 # not 'return': outside a function its effect is unspecified
+fi
+[ -w "$(dirname "$tmpfile")" ] || { echo "Cannot write to $tmpfile" >&2; exit 1; }
+[ -w "$(dirname "$newfile")" ] || { echo "Cannot write to $newfile" >&2; exit 1; }
+input= # start empty; do not inherit a value from the environment
+[ -p /dev/fd/0 ] && input="$input /dev/fd/0"
+for arg; do
+       [ -f "$arg" ] && input="$input $arg"
+done
+[ -z "$input" ] && { usage; exit 1; }
+[ -f "$tmpfile" ] && { rm "$tmpfile" || exit 1; } # touch files only once input exists
+[ -f "$newfile" ] && { mv "$newfile" "${newfile}~" || exit 1; }
+
+# internal processing functions
+parse_topurl() { # topurl -> topurl_s: one regex per line
+       # '.' and '*' leave topurl_s empty, so no if-top-url key is emitted
+       topurl_s=""
+       [ "$topurl" = '.' ] || [ "$topurl" = '*' ] && return
+       topurl_s="$(echo "$topurl" | tr $unit '\n')"
+}
+
+parse_party() { # party -> party_s: value for the load-type key
+       # default: empty, so no load-type key is emitted for the rule
+       party_s=""
+       case $party in
+       '.'|'*'|'any'|'all') return 0;;
+       1*|f*) party_s="first-party";;
+       3*|t*) party_s="third-party";;
+       *)
+               err_party
+               return 1;;
+       esac
+}
+
+parse_type() { # type -> type_s: long resource-type names, one per line
+       case $type in
+       '.'|'*'|all) type_s=""; return;; # empty: no resource-type key is emitted
+       esac
+       # split $type on $unit and map each short name; an unknown name calls err_type
+       type_s="$(echo "$type" | tr $unit '\n' | while read -r line; do
+               case $line in
+               doc) item=document;;
+               img) item=image;;
+               css) item=style-sheet;;
+               scr) item=script;;
+               fnt) item=font;;
+               raw) item=raw;;
+               vid|med) item=media;;
+               pop) item=popup;;
+               *)
+                       err_type
+                       return 1
+                       ;;
+               esac
+               echo "$item "
+       done)"
+} # NOTE(review): the main loop calls this without checking its exit status
+
+parse_action() { # action -> action_s; anything but reset/ignore means block
+       action_s="block"
+       case $action in
+       reset|ignore) action_s="ignore-previous-rules";;
+       esac
+}
+
+clear() {
+       # Drop every field read from the previous rule line, so that
+       # leftovers cannot make a later, incomplete rule look complete.
+       # NOTE(review): inside this script the name shadows any external
+       # command called "clear".
+       unset topurl request party
+       unset action type excess
+}
+
+# "main()"
+# get and split argument values from each line
+# ($input may include raw rules and multiple filenames; do not quote)
+set -f # regexes such as '.*' must reach jo verbatim, not as filename globs
+cat $input | while read -r topurl request party action type excess; do
+       # guards: blank line, comment, and initial malformation check
+       case $topurl in ''|'#'*) clear; continue;; esac
+       if [ -z "$type" ] || [ -n "$excess" ]; then
+               echo "Problematic rule identified:" >&2
+               echo "   $topurl $request $party $action $type" >&2
+               exit 1 # ends the loop (a subshell in most shells); see 'done' below
+       fi
+       # none of these variables are nil or contain an IFS
+
+       # translate config-format args into WebKit-format args
+       parse_topurl
+       parse_party
+       parse_type
+       parse_action
+
+       # collect args and form one JSON element
+       # (vars include multiple values to be split for jo; do not quote)
+       request_s="url-filter=$request"
+       [ -n "$topurl_s" ] && topurl_s="if-top-url=$(jo -a $topurl_s)"
+       [ -n "$party_s"  ] && party_s="load-type=$(jo -a $party_s)"
+       [ -n "$type_s"   ] && type_s="resource-type=$(jo -a $type_s)"
+       action_s="action=$(jo type=$action_s)"
+       # the subshell command here requires quoting to avoid expansion
+       # in ksh93 (mksh also accepts set +o braceexpand)
+       jo trigger="$(jo $request_s $topurl_s $party_s $type_s)" \
+               $action_s >> "$tmpfile"
+
+       # prevent old vars from making bad rules appear complete
+       clear
+done || exit 1
+
+# collate into one JSON array
+jo -a < "$tmpfile" > "$newfile"
diff --git a/surf.1 b/surf.1
index 496afb9..9b69f0b 100644
--- a/surf.1
+++ b/surf.1
@@ -3,7 +3,7 @@
 surf \- simple webkit-based browser
 .SH SYNOPSIS
 .B surf
-.RB [-bBdDfFgGiIkKmMnNpPsStTvwxX]
+.RB [-bBdDfFgGiIkKmMnNpPsStTvwxXyY]
 .RB [-a\ cookiepolicies]
 .RB [-c\ cookiefile]
 .RB [-C\ stylefile]
@@ -126,6 +126,12 @@ Disable custom certificates.
 .B -X
 Enable custom certificates.
 .TP
+.B -y
+Disable content filtering.
+.TP
+.B -Y
+Enable content filtering.
+.TP
 .B \-z zoomlevel
 Specify the
 .I zoomlevel
diff --git a/surf.c b/surf.c
index c25def7..642d98a 100644
--- a/surf.c
+++ b/surf.c
@@ -55,6 +55,7 @@ typedef enum {
        AccessWebcam,
        CaretBrowsing,
        Certificate,
+       ContentFilter,
        CookiePolicies,
        DiskCache,
        DefaultCharset,
@@ -86,6 +87,14 @@ typedef enum {
        ParameterLast
 } ParamName;

+typedef enum {
+       DoRemoveFilters        = -1, /* request: remove all filters from the view */
+       NoFiltersAreActive     =  0, /* state: setparameter() does nothing */
+       DoActivateFilters      =  1, /* request: setparameter() calls cfactivate() */
+       FiltersAreLoadingAsync =  2, /* state: store save started, callback pending */
+       FiltersAreActive       =  3  /* state: filter was added to a view */
+} ContentFilterState;
+
 typedef union {
        int i;
        float f;
@@ -168,6 +177,8 @@ static const char *getcert(const char *uri);
 static void setcert(Client *c, const char *file);
 static const char *getstyle(const char *uri);
 static void setstyle(Client *c, const char *file);
+static int cfactivate(Client *c, const Arg *a);
+static void cfactivatecb(GObject *src_obj, GAsyncResult *res, gpointer data);
 static void runscript(Client *c);
 static void evalscript(Client *c, const char *jsstr, ...);
 static void updatewinid(Client *c);
@@ -250,6 +261,9 @@ static Display *dpy;
 static Client *clients;
 static GdkDevice *gdkkb;
 static char *stylefile;
+static char *filterfile; /* config.def.h also defines this, with its initial value */
+static WebKitUserContentFilter *filter; /* set by cfactivatecb(); NULL until then */
+static WebKitUserContentFilterStore *filterstore; /* created in cfactivate() */
 static const char *useragent;
 static Parameter *curconfig;
 static int modparams[ParameterLast];
@@ -274,6 +288,7 @@ static ParamName loadcommitted[] = {
 //     AccessMicrophone,
 //     AccessWebcam,
        CaretBrowsing,
+       ContentFilter,
        DefaultCharset,
        FontSize,
        FrameFlattening,
@@ -315,7 +330,7 @@ die(const char *errstr, ...)
 void
 usage(void)
 {
-       die("usage: surf [-bBdDfFgGiIkKmMnNpPsStTvwxX]\n"
+       die("usage: surf [-bBdDfFgGiIkKmMnNpPsStTvwxXyY]\n"
            "[-a cookiepolicies ] [-c cookiefile] [-C stylefile] [-e xid]\n"
            "[-r scriptfile] [-u useragent] [-z zoomlevel] [uri]\n");
 }
@@ -349,6 +364,8 @@ setup(void)
        /* dirs and files */
        cookiefile = buildfile(cookiefile);
        scriptfile = buildfile(scriptfile);
+       filterfile = buildfile(filterfile);
+       filterdir  = buildpath(filterdir);
        certdir    = buildpath(certdir);
        if (curconfig[Ephemeral].val.i)
                cachedir = NULL;
@@ -735,6 +752,7 @@ seturiparameters(Client *c, const char *uri, ParamName *params)
                            defconfig[p].prio < modparams[p]))
                                continue;
                case Certificate:
+               case ContentFilter:
                case CookiePolicies:
                case Style:
                        setparameter(c, 0, p, &curconfig[p].val);
@@ -763,6 +781,26 @@ setparameter(Client *c, int refresh, ParamName p, const Arg *a)
                if (a->i)
                        setcert(c, geturi(c));
                return; /* do not update */
+       case ContentFilter:
+               switch (a->i) {
+               case NoFiltersAreActive: /* fallthrough */
+               case FiltersAreActive: /* fallthrough */
+               case FiltersAreLoadingAsync:
+                       return; /* do nothing */
+               case DoRemoveFilters:
+                       webkit_user_content_manager_remove_all_filters(
+                           webkit_web_view_get_user_content_manager(
+                           c->view));
+                       curconfig[p].val.i = NoFiltersAreActive;
+                       refresh = 1;
+                       break;
+               case DoActivateFilters:
+                       refresh = cfactivate(c, a);
+                       break;
+               default:
+                       return; /* do nothing */
+               }
+               break;
        case CookiePolicies:
                webkit_cookie_manager_set_accept_policy(
                    webkit_web_context_get_cookie_manager(
@@ -948,6 +986,85 @@ setstyle(Client *c, const char *file)
        g_free(style);
 }

+/*
+ * Attach the content filter to c's view, compiling the rules first if needed.
+ * If a compiled filter is already held in the global "filter", it is added to
+ * the view's user content manager at once and 1 is returned (reload wanted).
+ * Otherwise the JSON text of filterfile is passed to the filter store created
+ * for filterdir, which saves it asynchronously; cfactivatecb() completes the
+ * activation and 0 is returned.  0 is also returned when the paths are unset
+ * or the file cannot be read.  The argument a is currently unused.
+ */
+int
+cfactivate(Client *c, const Arg *a)
+{
+       static const gchar *cfid = "id";
+       gchar *filetext;
+       gsize filesize;
+       GBytes *cfbytes;
+       GError *err = NULL;
+
+       if (filter != NULL) {
+               webkit_user_content_manager_add_filter(
+                   webkit_web_view_get_user_content_manager(c->view),
+                   filter);
+               curconfig[ContentFilter].val.i = FiltersAreActive;
+               return 1;
+       }
+       if (filterfile == NULL || filterdir == NULL)
+               return 0;
+       if (!g_file_get_contents(filterfile, &filetext, &filesize, &err)) {
+               fprintf(stderr, "Error when accessing: %s\n\t%s\n",
+                       filterfile, err->message);
+               g_error_free(err);
+               return 0;
+       }
+       /* create the store only once so repeated activations do not leak it */
+       if (filterstore == NULL)
+               filterstore = webkit_user_content_filter_store_new(
+                   filterdir);
+       /* the GBytes adopts filetext and g_free()s it on the last unref */
+       cfbytes = g_bytes_new_take(filetext, filesize);
+       webkit_user_content_filter_store_save(filterstore, cfid, cfbytes,
+           NULL, cfactivatecb, c);
+       /* NOTE(review): assumes the store keeps its own reference while saving */
+       g_bytes_unref(cfbytes);
+       curconfig[ContentFilter].val.i = FiltersAreLoadingAsync;
+       return 0;
+}
+
+/*
+ * Completion callback for the save started in cfactivate(); data is the
+ * Client that requested activation.  On failure the error is reported and the
+ * parameter falls back to NoFiltersAreActive.  On success the new filter is
+ * kept in the global "filter", added to the client's view, and the page is
+ * reloaded through reload().
+ */
+void
+cfactivatecb(GObject *src_obj, GAsyncResult *res, gpointer data)
+{
+       Client *c = data;
+       Arg a = {.i = 1};
+       GError *err = NULL;
+
+       filter = webkit_user_content_filter_store_save_finish(
+                   filterstore, res, &err);
+       if (filter == NULL) {
+               fprintf(stderr, "Error generating content filter: %s\n",
+                       err ? err->message : "unknown error");
+               g_clear_error(&err);
+               curconfig[ContentFilter].val.i = NoFiltersAreActive;
+               return;
+       }
+       /* NOTE(review): c is used unchecked; confirm it cannot be destroyed
+        * while the save is still pending */
+       webkit_user_content_manager_add_filter(
+           webkit_web_view_get_user_content_manager(c->view),
+           filter);
+       curconfig[ContentFilter].val.i = FiltersAreActive;
+       reload(c, &a);
+}
+
 void
 runscript(Client *c)
 {
@@ -2106,6 +2198,14 @@ main(int argc, char *argv[])
                defconfig[Certificate].val.i = 1;
                defconfig[Certificate].prio = 2;
                break;
+       case 'y':
+               defconfig[ContentFilter].val.i = 0;
+               defconfig[ContentFilter].prio = 2;
+               break;
+       case 'Y':
+               defconfig[ContentFilter].val.i = 1;
+               defconfig[ContentFilter].prio = 2;
+               break;
        case 'z':
                defconfig[ZoomLevel].val.f = strtof(EARGF(usage()), NULL);
                defconfig[ZoomLevel].prio = 2;
--
2.20.1



Reply via email to