>From e997e0e63f4a28518747814c14d3c0785516790d Mon Sep 17 00:00:00 2001
From: Jonathan Bakke <[email protected]>
Date: Sun, 23 May 2021 18:28:23 -0700
Subject: [PATCH] Surf content filtering; beta demo.
Implements WebKit content filtering: the ContentFilter
parameter controls importing rules from a JSON file,
adding them to the user content manager, and removing
them. There is currently no way to change this
parameter interactively; intended additions include
toggling in the UI and autodetection of file changes.
In separate files are a shell script that creates JSON
rules and a demo configuration file for that script.
These are entirely independent of surf but were written
to make the above changes to surf accessible.
---
config.def.h | 3 +
mkwkcf.demo.rules | 17 +++++
mkwkcf.sh | 176 ++++++++++++++++++++++++++++++++++++++++++++++
surf.1 | 8 ++-
surf.c | 102 ++++++++++++++++++++++++++-
5 files changed, 304 insertions(+), 2 deletions(-)
create mode 100644 mkwkcf.demo.rules
create mode 100755 mkwkcf.sh
diff --git a/config.def.h b/config.def.h
index ef44721..c7a64c2 100644
--- a/config.def.h
+++ b/config.def.h
@@ -6,6 +6,8 @@ static char *styledir = "~/.surf/styles/";
static char *certdir = "~/.surf/certificates/";
static char *cachedir = "~/.surf/cache/";
static char *cookiefile = "~/.surf/cookies.txt";
+static char *filterfile = "~/.surf/filters.json";
+static char *filterdir = "~/.surf/filtmp/";
/* Webkit default features */
/* Highest priority value will be used.
@@ -19,6 +21,7 @@ static Parameter defconfig[ParameterLast] = {
[AccessWebcam] = { { .i = 0 }, },
[Certificate] = { { .i = 0 }, },
[CaretBrowsing] = { { .i = 0 }, },
+ [ContentFilter] = { { .i = 1 }, },
[CookiePolicies] = { { .v = "@Aa" }, },
[DefaultCharset] = { { .v = "UTF-8" }, },
[DiskCache] = { { .i = 1 }, },
diff --git a/mkwkcf.demo.rules b/mkwkcf.demo.rules
new file mode 100644
index 0000000..bad5b26
--- /dev/null
+++ b/mkwkcf.demo.rules
@@ -0,0 +1,17 @@
+# [top uri] [request uri] [1st/3rd party] [block/reset] [content type]
+
+# block everything
+. . . . .
+# allow all sites to load first-party HTML and CSS
+. . 1 reset doc`css
+# allow images and videos when visiting surf.suckless.org
+^https?://surf.suckless.org/ . . reset img`vid
+# block all connections matching evil.com/ when visiting any site
+. evil.com/ . . .
+# allow anything to happen on lazy.days
+^https?://.*lazy.days/ . . reset .
+# ...except third-party scripts and third-party raw data requests
+^https?://.*lazy.days/ . 3 . scr`raw
+# allow .local and .gov to load anything from their own domains
+^https?://.*local/`^https?://.*gov/ . 1 reset .
+surf.suck . . reset .
diff --git a/mkwkcf.sh b/mkwkcf.sh
new file mode 100755
index 0000000..14eb955
--- /dev/null
+++ b/mkwkcf.sh
@@ -0,0 +1,176 @@
+#!/bin/sh
+
+# generates WebKit content filter rules JSON file
+
+# copyright 2021 Jonathan Bakke; MIT/X Consortium license terms apply
+# this is new, untested, and unproven software. use at your own risk.
+
+# configuration
+newfile="$HOME/.surf/filters.json"
+tmpfile="$HOME/.surf/filters.tmp"
+unit='`'
+
+# docs
+usage() {
+ echo "Usage: $(basename "$0") [filename]" >&2
+ echo "This script translates a basic content filter configuration
format into a" >&2
+ echo "WebKit content filter JSON rules file. The input format has one
rule per" >&2
+ echo "line, with five arguments required per rule:" >&2
+ echo " top-url request-url party action content-type" >&2
+ echo "Any argument may be '.' to include everything, or:" >&2
+ echo " top-url is a regex that applies to the urlbar address" >&2
+ echo " request-url is a regex that applies to each requested
connection" >&2
+ echo " party specifies first- [1*|f*] or third- [3*|t*] party
requests" >&2
+ echo " action is to either 'block' or 'reset' matching request rules,
and" >&2
+ echo " content-type may be: 'doc' for documents (e.g., HTML), 'css'
for" >&2
+ echo " style sheets, 'fnt' for fonts, 'img' for images, 'vid' for
media" >&2
+ echo " (e.g., video), 'scr' for scripts, 'raw' for untyped loads."
>&2
+ echo " Note that this only applies to network requests, JavaScript
in" >&2
+ echo " an allowed HTML doc will load." >&2
+ echo "top-url and content-type arguments may be combined using
\"$unit\" as a" >&2
+ echo "separator; e.g., \"site1\`site2 . . . img\`vid\"." >&2
+ echo "Regular expressions appear broken in WebKit. '.' will match any
char and" >&2
+ echo "'.*' will match any string, but '\\.' causes unexpected
behavior." >&2
+ echo "Do not use anything more complex than '^https?://.*site.com/',
and" >&2
+ echo "know that this will match 'http://sitexcom'." >&2.
+ #echo "Regular expressions are limited to '.', '?', '+', '*', '()', and
'[]'," >&2
+ #echo "and spaces need to be translated (e.g., '%20') before entry." >&2
+ echo "Blank lines and those beginning with '#' are ignored." >&2
+ echo "The rules file may be piped to $(basename "$0") instead of
providing a file." >&2
+ echo "The output JSON file is specified at the top of this script." >&2
+}
+
+# err_party: report on stderr that $party was not recognized and that the
+# rule is being applied to both first- and third-party requests.
+# Reads $topurl, $request, $party, $action and $type to show the rule.
+# printf is used instead of echo because the rule fields are regexes and
+# some echo implementations rewrite backslash sequences in their arguments.
+err_party() {
+	printf '%s\n' \
+		"Unrecognized argument for party:" \
+		"\"$party\" in $topurl $request $party $action $type" \
+		"Accepted arguments begin with '1', 'f', '3' or 't'," \
+		"or '.', '*', 'any' or 'all' to apply to both kinds." \
+		"Applying rule to both first- and third- party requests." >&2
+}
+
+err_type() {
+ echo "Unrecognized argument for type:" >&2
+ echo "\"$line\" in $topurl $request $party $action $type" >&2
+ echo "Accepted arguments include '.' or" >&2
+ echo "[doc|img|css|scr|fnt|raw|vid|pop], or" >&2
+ echo "a combination of grave-separated values, e.g. 'scr\`raw'" >&2
+}
+
+# check for dependencies, sanity, and input
+# (developed using dash and jo from Debian buster/stable;
+# lightly tested with bash, mksh, and ksh93; OpenBSD testing pending)
+# `exit` is used throughout: outside a function or sourced file the effect
+# of `return` is unspecified, and bash reports an error and keeps running.
+if ! command -v jo >/dev/null 2>&1; then
+	echo "The 'jo' utility is required. Check your package manager or" >&2
+	echo "https://github.com/jpmens/jo" >&2
+	exit 1
+fi
+
+# collect the input sources before touching any file, so that a run with
+# no input only prints the usage text and leaves the current rules alone
+# (start from empty so a value inherited from the environment is not used)
+input=""
+[ -p /dev/fd/0 ] && input="$input /dev/fd/0"
+for arg; do
+	[ -f "$arg" ] && input="$input $arg"
+done
+[ -z "$input" ] && { usage; exit 1; }
+
+# make sure both target directories are writable before changing anything
+[ -w "$(dirname "$tmpfile")" ] || { echo "Cannot write to $tmpfile" >&2; exit 1; }
+[ -w "$(dirname "$newfile")" ] || { echo "Cannot write to $newfile" >&2; exit 1; }
+
+# drop a stale temporary file and keep the previous output as a backup
+[ -f "$tmpfile" ] && { rm "$tmpfile" || exit 1; }
+[ -f "$newfile" ] && { mv "$newfile" "${newfile}~" || exit 1; }
+
+# internal processing functions
+# parse_topurl: translate $topurl into $topurl_s.
+# '.' or '*' means any top URL and yields an empty $topurl_s; otherwise the
+# $unit-separated regexes are split onto one line each.
+# printf is used instead of echo because some echo implementations rewrite
+# backslash sequences, which would alter a regex.
+parse_topurl() {
+	case "$topurl" in
+		'.'|'*') topurl_s="";;
+		*) topurl_s="$(printf '%s\n' "$topurl" | tr "$unit" '\n')";;
+	esac
+}
+
+parse_party() {
+ case $party in
+ '.'|'*'|'any'|'all') party_s="";;
+ 1*|f*) party_s="first-party" ;;
+ 3*|t*) party_s="third-party" ;;
+ *)
+ err_party
+ party_s=""
+ return 1
+ ;;
+ esac
+}
+
+parse_type() { # translate $type into $type_s, a list of WebKit resource-type names
+ case $type in
+ '.'|'*'|all) type_s=""; return;; # wildcard: empty $type_s, nothing more to do
+ esac
+
+ type_s="$(echo "$type" | tr $unit '\n' | while read -r line; do
+ case $line in
+ doc) item=document;;
+ img) item=image;;
+ css) item=style-sheet;;
+ scr) item=script;;
+ fnt) item=font;;
+ raw) item=raw;;
+ vid|med) item=media;; # both spellings map to the same name
+ pop) item=popup;;
+ *)
+ err_type # reports the item held in $line
+ return 1
+ ;;
+ esac
+ echo "$item "
+ done)" # NOTE(review): the return above runs inside the substitution's pipeline, so it presumably ends only that subshell and names echoed before the bad item stay in $type_s - confirm
+}
+
+# parse_action: translate $action into $action_s.
+# 'reset' and 'ignore' select "ignore-previous-rules"; every other value,
+# including '.', selects "block".
+parse_action() {
+	action_s="block"
+	case "$action" in
+		reset|ignore)
+			action_s="ignore-previous-rules"
+			;;
+	esac
+}
+
+# clear: forget every field read from the current rule line
+clear() {
+	unset topurl request party action type excess
+}
+
+# "main()"
+# get and split argument values from each line
+# ($input may include raw rules and multiple filenames; do not quote)
+# The loop is the tail of a pipeline and may run in a subshell, so it
+# stops with `exit 1`; the `|| exit 1` after `done` then ends the script.
+# (`return` is not usable here: outside a function its effect is
+# unspecified, and bash reports an error and carries on.)
+cat $input | while read -r topurl request party action type excess; do
+	# guards: blank line, comment, and initial malformation check
+	# (read strips leading whitespace, so a comment line always has '#'
+	# as the first character of the first field)
+	case "$topurl" in
+		''|'#'*) clear; continue;;
+	esac
+	if [ -z "$type" ] || [ -n "$excess" ]; then
+		printf '%s\n' "Problematic rule identified:" \
+			" $topurl $request $party $action $type" >&2
+		exit 1
+	fi
+	# none of these variables are nil or contain an IFS
+
+	# translate config-format args into WebKit-format args
+	parse_topurl
+	# an unrecognized party is reported and the rule applied to both kinds
+	parse_party
+	# an unrecognized content type must not silently widen the rule to
+	# other resource types, so stop here (err_type has reported the item)
+	parse_type || exit 1
+	parse_action
+
+	# collect args and form one JSON element
+	# (vars include multiple values to be split for jo; do not quote)
+	request_s="url-filter=$request"
+	[ -n "$topurl_s" ] && topurl_s="if-top-url=$(jo -a $topurl_s)"
+	[ -n "$party_s" ] && party_s="load-type=$(jo -a $party_s)"
+	[ -n "$type_s" ] && type_s="resource-type=$(jo -a $type_s)"
+	action_s="action=$(jo type=$action_s)"
+	# the subshell command here requires quoting to avoid expansion
+	# in ksh93 (mksh also accepts set +o braceexpand)
+	jo trigger="$(jo $request_s $topurl_s $party_s $type_s)" \
+		$action_s >> "$tmpfile"
+
+	# prevent old vars from making bad rules appear complete
+	clear
+done || exit 1
+
+# collate into one JSON array
+jo -a < "$tmpfile" > "$newfile"
diff --git a/surf.1 b/surf.1
index 496afb9..9b69f0b 100644
--- a/surf.1
+++ b/surf.1
@@ -3,7 +3,7 @@
surf \- simple webkit-based browser
.SH SYNOPSIS
.B surf
-.RB [-bBdDfFgGiIkKmMnNpPsStTvwxX]
+.RB [-bBdDfFgGiIkKmMnNpPsStTvwxXyY]
.RB [-a\ cookiepolicies]
.RB [-c\ cookiefile]
.RB [-C\ stylefile]
@@ -126,6 +126,12 @@ Disable custom certificates.
.B -X
Enable custom certificates.
.TP
+.B -y
+Disable content filtering.
+.TP
+.B -Y
+Enable content filtering.
+.TP
.B \-z zoomlevel
Specify the
.I zoomlevel
diff --git a/surf.c b/surf.c
index c25def7..642d98a 100644
--- a/surf.c
+++ b/surf.c
@@ -55,6 +55,7 @@ typedef enum {
AccessWebcam,
CaretBrowsing,
Certificate,
+ ContentFilter,
CookiePolicies,
DiskCache,
DefaultCharset,
@@ -86,6 +87,14 @@ typedef enum {
ParameterLast
} ParamName;
+/*
+ * Values held by the ContentFilter parameter. Two of them are requests
+ * that setparameter() acts on; the other three record the current state
+ * and make setparameter() return without doing anything.
+ */
+typedef enum {
+	DoRemoveFilters        = -1, /* request: remove all filters from the view */
+	NoFiltersAreActive     =  0, /* state: nothing installed (also what -y sets) */
+	DoActivateFilters      =  1, /* request: run cfactivate() (default, and -Y) */
+	FiltersAreLoadingAsync =  2, /* state: store save started, callback pending */
+	FiltersAreActive       =  3  /* state: filter added to the content manager */
+} ContentFilterState;
+
typedef union {
int i;
float f;
@@ -168,6 +177,8 @@ static const char *getcert(const char *uri);
static void setcert(Client *c, const char *file);
static const char *getstyle(const char *uri);
static void setstyle(Client *c, const char *file);
+static int cfactivate(Client *c, const Arg *a);
+static void cfactivatecb(GObject *src_obj, GAsyncResult *res, gpointer data);
static void runscript(Client *c);
static void evalscript(Client *c, const char *jsstr, ...);
static void updatewinid(Client *c);
@@ -250,6 +261,9 @@ static Display *dpy;
static Client *clients;
static GdkDevice *gdkkb;
static char *stylefile;
+static char *filterfile;
+static WebKitUserContentFilter *filter;
+static WebKitUserContentFilterStore *filterstore;
static const char *useragent;
static Parameter *curconfig;
static int modparams[ParameterLast];
@@ -274,6 +288,7 @@ static ParamName loadcommitted[] = {
// AccessMicrophone,
// AccessWebcam,
CaretBrowsing,
+ ContentFilter,
DefaultCharset,
FontSize,
FrameFlattening,
@@ -315,7 +330,7 @@ die(const char *errstr, ...)
void
usage(void)
{
- die("usage: surf [-bBdDfFgGiIkKmMnNpPsStTvwxX]\n"
+ die("usage: surf [-bBdDfFgGiIkKmMnNpPsStTvwxXyY]\n"
"[-a cookiepolicies ] [-c cookiefile] [-C stylefile] [-e xid]\n"
"[-r scriptfile] [-u useragent] [-z zoomlevel] [uri]\n");
}
@@ -349,6 +364,8 @@ setup(void)
/* dirs and files */
cookiefile = buildfile(cookiefile);
scriptfile = buildfile(scriptfile);
+ filterfile = buildfile(filterfile);
+ filterdir = buildpath(filterdir);
certdir = buildpath(certdir);
if (curconfig[Ephemeral].val.i)
cachedir = NULL;
@@ -735,6 +752,7 @@ seturiparameters(Client *c, const char *uri, ParamName
*params)
defconfig[p].prio < modparams[p]))
continue;
case Certificate:
+ case ContentFilter:
case CookiePolicies:
case Style:
setparameter(c, 0, p, &curconfig[p].val);
@@ -763,6 +781,26 @@ setparameter(Client *c, int refresh, ParamName p, const
Arg *a)
if (a->i)
setcert(c, geturi(c));
return; /* do not update */
+ case ContentFilter:
+ switch (a->i) {
+ case NoFiltersAreActive: /* fallthrough */
+ case FiltersAreActive: /* fallthrough */
+ case FiltersAreLoadingAsync:
+ return; /* do nothing */
+ case DoRemoveFilters:
+ webkit_user_content_manager_remove_all_filters(
+ webkit_web_view_get_user_content_manager(
+ c->view));
+ curconfig[p].val.i = NoFiltersAreActive;
+ refresh = 1;
+ break;
+ case DoActivateFilters:
+ refresh = cfactivate(c, a);
+ break;
+ default:
+ return; /* do nothing */
+ }
+ break;
case CookiePolicies:
webkit_cookie_manager_set_accept_policy(
webkit_web_context_get_cookie_manager(
@@ -948,6 +986,60 @@ setstyle(Client *c, const char *file)
g_free(style);
}
+/*
+ * Start applying the content filter to client c.
+ *
+ * If a compiled filter already exists it is added to the view's user
+ * content manager right away, the parameter becomes FiltersAreActive and
+ * 1 is returned (setparameter() uses this as its refresh flag).
+ * Otherwise the rules in filterfile are read and handed to the filter
+ * store for asynchronous saving, the parameter becomes
+ * FiltersAreLoadingAsync and 0 is returned; cfactivatecb() completes the
+ * work. 0 is also returned, with the parameter left untouched, when
+ * filterfile or filterdir is unset or the file cannot be read.
+ *
+ * The argument a is not used.
+ */
+int
+cfactivate(Client *c, const Arg *a)
+{
+	static const gchar *cfid = "id";
+	gchar *filetext;
+	gsize filesize;
+	GBytes *cfbytes;
+	GError *err = NULL;
+
+	if (filter != NULL) {
+		webkit_user_content_manager_add_filter(
+		    webkit_web_view_get_user_content_manager(c->view),
+		    filter);
+		curconfig[ContentFilter].val.i = FiltersAreActive;
+		return 1;
+	}
+	if (filterfile == NULL || filterdir == NULL)
+		return 0;
+	if (!g_file_get_contents(filterfile, &filetext, &filesize, &err)) {
+		fprintf(stderr, "Error when accessing: %s\n\t%s\n",
+		        filterfile, err->message);
+		g_error_free(err);
+		return 0;
+	}
+	/* reuse the store on a retry instead of creating another one */
+	if (filterstore == NULL)
+		filterstore = webkit_user_content_filter_store_new(filterdir);
+	/* g_bytes_new() copies, so filetext can be freed independently */
+	cfbytes = g_bytes_new(filetext, filesize);
+	g_free(filetext);
+	webkit_user_content_filter_store_save(filterstore, cfid, cfbytes,
+	                                      NULL, cfactivatecb, c);
+	/* drop our reference; the pending save is expected to hold its own */
+	g_bytes_unref(cfbytes);
+	curconfig[ContentFilter].val.i = FiltersAreLoadingAsync;
+	return 0;
+}
+
+/*
+ * Completion callback for the filter store save started by cfactivate().
+ *
+ * data is the Client the save was started for. On failure the error is
+ * printed, filter is reset to NULL and the parameter becomes
+ * NoFiltersAreActive. On success the new filter is added to the client's
+ * user content manager, the parameter becomes FiltersAreActive and the
+ * client is reloaded. src_obj is not used; the global filterstore is.
+ *
+ * NOTE(review): c is dereferenced without checking that the client still
+ * exists when the save completes -- confirm it cannot be destroyed first.
+ */
+void
+cfactivatecb(GObject *src_obj, GAsyncResult *res, gpointer data)
+{
+	Client *c = data;
+	Arg a = {.i = 1};
+	GError *err = NULL;
+
+	filter = webkit_user_content_filter_store_save_finish(
+	    filterstore, res, &err);
+	if (err != NULL) {
+		fprintf(stderr, "Error generating content filter: %s\n",
+		        err->message);
+		g_error_free(err);
+		filter = NULL;
+		curconfig[ContentFilter].val.i = NoFiltersAreActive;
+		return;
+	}
+	webkit_user_content_manager_add_filter(
+	    webkit_web_view_get_user_content_manager(c->view),
+	    filter);
+	curconfig[ContentFilter].val.i = FiltersAreActive;
+	reload(c, &a);
+}
+
void
runscript(Client *c)
{
@@ -2106,6 +2198,14 @@ main(int argc, char *argv[])
defconfig[Certificate].val.i = 1;
defconfig[Certificate].prio = 2;
break;
+ case 'y':
+ defconfig[ContentFilter].val.i = 0;
+ defconfig[ContentFilter].prio = 2;
+ break;
+ case 'Y':
+ defconfig[ContentFilter].val.i = 1;
+ defconfig[ContentFilter].prio = 2;
+ break;
case 'z':
defconfig[ZoomLevel].val.f = strtof(EARGF(usage()), NULL);
defconfig[ZoomLevel].prio = 2;
--
2.20.1