JID case sensitivity, again

Alex Vandiver Fri, 25 Sep 2009 11:28:17 -0700

Heya,
  I was cleaning up some local repositories the other day, and noticed
that the JID case sensitivity patches have yet to be applied.  I'd love
to not have to keep a local fork of djabberd[1] in order to fix this
rather easily fixable but fundamentally important issue.  As we've seen
on the lists, this is not purely a theoretical problem -- people are
actually being bitten by it.


The only reason I've heard for not applying them was fears about
backwards incompatability.  With the attached patches, users' options
are:

 * Add 'CaseSensitive on', and nothing is changed.
 * Run 'fix-jabber-roster-case' to fix up SQLite rosters.
 * If you have non-SQLite rosters, write a tool to do the case-folding 
   and merging for your own storage format.

I can't see any way to make this easier than the above, and I really do
want to see this bug put to rest.
 - Alex

[1] http://github.com/bestpractical/djabberd

>From 7234cc3410b9e72dec7e0ff1763aadfc260b0144 Mon Sep 17 00:00:00 2001
From: Alex Vandiver <ale...@bestpractical.com>
Date: Thu, 26 Mar 2009 21:24:28 -0400
Subject: [PATCH 1/5] Do stringprep, as specified by the RFC, on JID object creation

---
 DJabberd/Makefile.PL         |    1 +
 DJabberd/lib/DJabberd/JID.pm |   89 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 89 insertions(+), 1 deletions(-)

diff --git a/DJabberd/Makefile.PL b/DJabberd/Makefile.PL
index 6de3e23..cfc06e2 100644
--- a/DJabberd/Makefile.PL
+++ b/DJabberd/Makefile.PL
@@ -14,6 +14,7 @@ WriteMakefile(
                   'Net::SSLeay'                  => 0,
                   'Log::Log4perl'                => 0,
                   'Digest::HMAC_SHA1'            => 0,
+                  'Unicode::Stringprep'          => 0,
               },
               clean      => { FILES => 't/log/*' },
               AUTHOR     => 'Brad Fitzpatrick <b...@danga.com>',
diff --git a/DJabberd/lib/DJabberd/JID.pm b/DJabberd/lib/DJabberd/JID.pm
index 3959ee5..f37e29c 100644
--- a/DJabberd/lib/DJabberd/JID.pm
+++ b/DJabberd/lib/DJabberd/JID.pm
@@ -13,6 +13,84 @@ use constant AS_STRING  => 3;
 use constant AS_BSTRING => 4;
 use constant AS_STREXML => 5;
 
+# Stringprep functions for converting to canonical form
+use Unicode::Stringprep;
+use Unicode::Stringprep::Mapping;
+use Unicode::Stringprep::Prohibited;
+my $nodeprep = Unicode::Stringprep->new(
+    3.2,
+    [
+        \...@unicode::Stringprep::Mapping::B1,
+        \...@unicode::Stringprep::Mapping::B2,
+    ],
+    'KC',
+    [
+        \...@unicode::Stringprep::Prohibited::C11,
+        \...@unicode::Stringprep::Prohibited::C12,
+        \...@unicode::Stringprep::Prohibited::C21,
+        \...@unicode::Stringprep::Prohibited::C22,
+        \...@unicode::Stringprep::Prohibited::C3,
+        \...@unicode::Stringprep::Prohibited::C4,
+        \...@unicode::Stringprep::Prohibited::C5,
+        \...@unicode::Stringprep::Prohibited::C6,
+        \...@unicode::Stringprep::Prohibited::C7,
+        \...@unicode::Stringprep::Prohibited::C8,
+        \...@unicode::Stringprep::Prohibited::C9,
+        [
+            0x0022, undef, # "
+            0x0026, undef, # &
+            0x0027, undef, # '
+            0x002F, undef, # /
+            0x003A, undef, # :
+            0x003C, undef, # <
+            0x003E, undef, # >
+            0x0040, undef, # @
+        ]
+    ],
+    1,
+);
+my $nameprep = Unicode::Stringprep->new(
+    3.2,
+    [
+        \...@unicode::Stringprep::Mapping::B1,
+        \...@unicode::Stringprep::Mapping::B2,
+    ],
+    'KC',
+    [
+        \...@unicode::Stringprep::Prohibited::C12,
+        \...@unicode::Stringprep::Prohibited::C22,
+        \...@unicode::Stringprep::Prohibited::C3,
+        \...@unicode::Stringprep::Prohibited::C4,
+        \...@unicode::Stringprep::Prohibited::C5,
+        \...@unicode::Stringprep::Prohibited::C6,
+        \...@unicode::Stringprep::Prohibited::C7,
+        \...@unicode::Stringprep::Prohibited::C8,
+        \...@unicode::Stringprep::Prohibited::C9,
+    ],
+    1,
+);
+my $resourceprep = Unicode::Stringprep->new(
+    3.2,
+    [
+        \...@unicode::Stringprep::Mapping::B1,
+    ],
+    'KC',
+    [
+        \...@unicode::Stringprep::Prohibited::C12,
+        \...@unicode::Stringprep::Prohibited::C21,
+        \...@unicode::Stringprep::Prohibited::C22,
+        \...@unicode::Stringprep::Prohibited::C3,
+        \...@unicode::Stringprep::Prohibited::C4,
+        \...@unicode::Stringprep::Prohibited::C5,
+        \...@unicode::Stringprep::Prohibited::C6,
+        \...@unicode::Stringprep::Prohibited::C7,
+        \...@unicode::Stringprep::Prohibited::C8,
+        \...@unicode::Stringprep::Prohibited::C9,
+    ],
+    1,
+);
+
+
 # returns DJabberd::JID object, or undef on failure due to invalid format
 sub new {
     #my ($class, $jidstring) = @_;
@@ -29,7 +107,16 @@ sub new {
            (?: /(.{1,1023})   )?                                           # $3: optional resource
            $!x;
 
-    return bless [ $1, $2, $3 ], $_[0];
+    # Stringprep uses regexes, so store these away first
+    my ($node, $host, $res) = ($1, $2, $3);
+
+    return eval {
+        bless [
+            defined $node ? $nodeprep->($node) : undef,
+            $nameprep->($host),
+            defined $res  ? $resourceprep->($res) : undef,
+        ], $_[0]
+    };
 }
 
 sub is_bare {
-- 
1.6.3.3.473.gb74fc4.dirty

>From 81a71a37d10546f1ecae8dd09e878114ecb45a29 Mon Sep 17 00:00:00 2001
From: Alex Vandiver <ale...@bestpractical.com>
Date: Thu, 9 Apr 2009 13:17:42 -0400
Subject: [PATCH 2/5] Lower-case all domains for checking verification status

---
 DJabberd/lib/DJabberd/Connection/ServerIn.pm |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/DJabberd/lib/DJabberd/Connection/ServerIn.pm b/DJabberd/lib/DJabberd/Connection/ServerIn.pm
index 32c7599..d7ca323 100644
--- a/DJabberd/lib/DJabberd/Connection/ServerIn.pm
+++ b/DJabberd/lib/DJabberd/Connection/ServerIn.pm
@@ -16,7 +16,7 @@ sub set_vhost {
 
 sub peer_domain_is_verified {
     my ($self, $domain) = @_;
-    return $self->{verified_remote_domain}->{$domain};
+    return $self->{verified_remote_domain}->{lc $domain};
 }
 
 sub on_stream_start {
@@ -140,7 +140,7 @@ sub dialback_result_valid {
     my %opts = @_;
 
     my $res = qq{<db:result from='$opts{recv_server}' to='$opts{orig_server}' type='valid'/>};
-    $self->{verified_remote_domain}->{$opts{orig_server}} = $opts{orig_server};
+    $self->{verified_remote_domain}->{lc $opts{orig_server}} = $opts{orig_server};
     $self->log->debug("Dialback result valid for connection $self->{id}.  from=$opts{recv_server}, to=$opts{orig_server}: $res\n");
     $self->write($res);
 }
-- 
1.6.3.3.473.gb74fc4.dirty

>From e327279d56e61e9c84ff393d03c13ae0ee144850 Mon Sep 17 00:00:00 2001
From: Alex Vandiver <ale...@bestpractical.com>
Date: Fri, 3 Apr 2009 17:47:36 -0400
Subject: [PATCH 3/5] Add global CaseSensitive flag for backwards compatibility

---
 DJabberd/lib/DJabberd.pm     |    5 +++++
 DJabberd/lib/DJabberd/JID.pm |    7 +++++++
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/DJabberd/lib/DJabberd.pm b/DJabberd/lib/DJabberd.pm
index 9debd3d..a168fdb 100644
--- a/DJabberd/lib/DJabberd.pm
+++ b/DJabberd/lib/DJabberd.pm
@@ -167,6 +167,11 @@ sub fake_s2s_peer {
     return $fake_peers{$host};
 }
 
+sub set_config_casesensitive {
+    my ($self, $val) = @_;
+    $DJabberd::JID::CASE_SENSITIVE = as_bool($val);
+}
+
 sub add_vhost {
     my ($self, $vhost) = @_;
     my $sname = lc $vhost->name;
diff --git a/DJabberd/lib/DJabberd/JID.pm b/DJabberd/lib/DJabberd/JID.pm
index f37e29c..bfef24c 100644
--- a/DJabberd/lib/DJabberd/JID.pm
+++ b/DJabberd/lib/DJabberd/JID.pm
@@ -3,6 +3,9 @@ use strict;
 use DJabberd::Util qw(exml);
 use Digest::SHA1;
 
+# Configurable via 'CaseSensitive' config option
+our $CASE_SENSITIVE = 0;
+
 use overload
     '""' => \&as_string_exml;
 
@@ -107,6 +110,10 @@ sub new {
            (?: /(.{1,1023})   )?                                           # $3: optional resource
            $!x;
 
+    # If we're in case-sensitive mode, for backwards-compatibility,
+    # then skip stringprep
+    return bless [ $1, $2, $3 ], $_[0] if $DJabberd::JID::CASE_SENSITIVE;
+
     # Stringprep uses regexes, so store these away first
     my ($node, $host, $res) = ($1, $2, $3);
 
-- 
1.6.3.3.473.gb74fc4.dirty

>From 2996bf98e3597ce928f2cd4e826542621c83dd76 Mon Sep 17 00:00:00 2001
From: Alex Vandiver <ale...@bestpractical.com>
Date: Thu, 24 Sep 2009 17:35:42 -0400
Subject: [PATCH 4/5] Add tool to unify mixed-case records in existing SQLite rosters

---
 .../fix-jabber-roster-case                         |  176 ++++++++++++++++++++
 1 files changed, 176 insertions(+), 0 deletions(-)
 create mode 100755 DJabberd-RosterStorage-SQLite/fix-jabber-roster-case

diff --git a/DJabberd-RosterStorage-SQLite/fix-jabber-roster-case b/DJabberd-RosterStorage-SQLite/fix-jabber-roster-case
new file mode 100755
index 0000000..e3760f8
--- /dev/null
+++ b/DJabberd-RosterStorage-SQLite/fix-jabber-roster-case
@@ -0,0 +1,176 @@
+#!/usr/bin/perl
+
+use DJabberd::JID;
+use DJabberd::Subscription;
+use DBI;
+use strict;
+use warnings;
+
+die "$0 must be run with a case-folding version of DJabberd\n"
+    unless DJabberd::JID->new('t...@example.com')->eq(DJabberd::JID->new('t...@example.com'));
+
+my $roster = shift @ARGV;
+die "Usage: $0 rosterfile\n" unless defined $roster and -f $roster;
+
+# Note that all work is performed in a transaction for consistency,
+# hence AutoCommit => 0
+my $dbh = DBI->connect(
+    "dbi:SQLite:dbname=$roster", "", "",
+    { RaiseError => 1, PrintError => 0, AutoCommit => 0 }
+);
+
+my $all_dups = $dbh->selectcol_arrayref(<<SQL);
+SELECT LOWER(first.jid)
+  FROM jidmap first
+       JOIN jidmap other
+         ON LOWER(first.jid) = LOWER(other.jid)
+       LEFT JOIN jidmap later
+         ON LOWER(later.jid) = LOWER(first.jid)
+        AND later.jidid > first.jidid
+ GROUP BY first.jid
+HAVING COUNT(DISTINCT other.jid) > 1
+   AND later.jid IS NULL
+ ORDER BY COUNT(DISTINCT other.jid), LOWER(first.jid)
+SQL
+
+for my $name (@{$all_dups}) {
+    warn "Looking at $name\n";
+
+    # Grab the set of dups
+    my $dups = $dbh->selectall_arrayref(<<SQL, undef, $name);
+SELECT jidid, jid FROM jidmap WHERE LOWER(jid) = ?
+SQL
+
+    # For later use, build up the set of JIDs ids to coalesce.
+    my $jidset = "(" . join(",", map {$_->[0]} @$dups) .")";
+    warn "  JID equivalence set is $jidset\n";
+
+    # Since this is the output of LOWER(jid), it is _probably_ already
+    # correct, but we run it through canonicalize to get KC
+    # normalization, etc
+    $name = canonicalize($name);
+    next unless defined $name;
+    warn "  Canonical form is $name\n";
+
+    # Blow away all of the duplicate JIDs, and re-insert the new name
+    $dbh->do("DELETE FROM jidmap WHERE jidid IN $jidset");
+    $dbh->do("INSERT INTO jidmap(jid) VALUES(?)", undef, $name);
+    my $newid = $dbh->last_insert_id(undef, undef, "jidmap", "jidid");
+    warn "  New row id is $newid\n";
+    
+    # Next, find all of the places in the roster where the old IDs
+    # showed up, either as userids or contactids
+    my %types = (userid => "contactid", contactid => "userid");
+    for my $column (keys %types) {
+        warn "  Looking for occurrances in $column\n";
+        my $other = $types{$column};
+        # We now generate a list of contacts that any of the JIDs as
+        # the user, (or users who have any of the JIDs as a contact)
+        my $otherlist = $dbh->selectcol_arrayref(<<SQL);
+SELECT $other FROM roster WHERE $column IN $jidset GROUP BY $other 
+SQL
+        for my $otherval (@$otherlist) {
+            my $merge = $dbh->selectall_arrayref(<<SQL, { Slice => {} });
+SELECT $column, name, subscription FROM roster WHERE $other = $otherval AND $column IN $jidset ORDER BY $column ASC
+SQL
+            # We now have a set of information to merge together, in $merge
+            warn "    $otherval references JIDs @{[map $_->{$column}, @$merge]} in $column\n";
+            $dbh->do(<<SQL, undef, $newid, $otherval, merge(@$merge));
+INSERT INTO roster($column, $other, name, subscription) VALUES(?, ?, ?, ?)
+SQL
+        }
+    }
+
+    # Delete any references to old JIDs in the roster
+    $dbh->do("DELETE FROM roster WHERE contactid IN $jidset OR userid IN $jidset");
+
+    # Merge roster groups
+    my $grouplist = $dbh->selectcol_arrayref(<<SQL);
+SELECT name FROM rostergroup WHERE userid IN $jidset GROUP BY name
+SQL
+    for my $groupname (@$grouplist) {
+        # Make a new group of this name for the new JID's id
+        $dbh->do("INSERT INTO rostergroup(userid, name) VALUES(?, ?)", undef, $newid, $groupname);
+        my $groupid = $dbh->last_insert_id(undef, undef, "rostergroup", "groupid");
+        # Find all of the distinct contacts in groups of this name in
+        # any of the old JIDs, and add them to the new group
+        my $merge = $dbh->do(<<SQL, undef, $groupid, $groupname);
+INSERT INTO groupitem(groupid, contactid)
+SELECT ?, groupitem.contactid
+  FROM rostergroup
+       JOIN groupitem
+         ON rostergroup.groupid = groupitem.groupid
+ WHERE rostergroup.name = ?
+   AND rostergroup.userid IN $jidset
+ GROUP BY groupitem.contactid
+SQL
+    }
+    # Remove the old groups
+    $dbh->do("DELETE FROM rostergroup WHERE userid IN $jidset");
+
+    # Look for places the any of the old JIDs appeared in other roster
+    # groups, and replace them with the new JID's id
+    $dbh->do(<<SQL, undef, $newid);
+INSERT INTO groupitem(groupid, contactid)
+SELECT groupid, ?
+  FROM groupitem
+ WHERE contactid IN $jidset
+ GROUP BY groupid
+SQL
+    # Remove the old contacts from the groups
+    $dbh->do("DELETE FROM groupitem WHERE contactid IN $jidset");
+
+}
+
+# The above merely handles cases where one JID appears twice; now we
+# iterate though _all_ names, and lower-case them, to catch JIDs which
+# are incorrectly cased but only appear once.
+my $all = $dbh->selectall_arrayref("SELECT jid, jidid FROM jidmap", { Slice => {} });
+for my $row (@{$all}) {
+    # Canonicalize the name.  Since this is the output of LOWER(jid),
+    # it is _probably_ already correct, but we round-trip through
+    # DJabberd::JID to ensure we get KC normalization, and the like
+    my $name = canonicalize($row->{jid});
+    next unless defined $name;
+    next if $name eq $row->{jid};
+    warn "Looking at @{[$row->{jid}]}\n";
+    warn "  Canonical form is $name\n";
+    $dbh->do("UPDATE jidmap SET jid = ? WHERE jidid = ?", undef, $name, $row->{jidid});
+}
+
+$dbh->commit or die "Error commiting changes: ".$dbh->errstr;
+
+
+
+
+sub canonicalize {
+    # Canonicalize the name.  We round-trip through DJabberd::JID to
+    # ensure we get KC normalization, case folding, and the like
+    my $name = shift;
+    my $jid = DJabberd::JID->new($name);
+    unless ($jid) {
+        warn "Can't make canonical form of $name!";
+        return undef;
+    }
+    return $jid->as_string;
+}
+
+sub merge {
+    my (@merge) = @_;
+
+    # Trivial case
+    return ($merge[0]->{name}, $merge[0]->{subscription})
+        if @merge == 1;
+
+    # More complex case; name is arbitrarily picked to be first non-null name, or null
+    my($name) = grep {defined} map {$_->{name}} @merge;
+    @merge = map {DJabberd::Subscription->from_bitmask($_->{subscription})} @merge;
+
+    # to overrides pendout, from overrides pendin
+    my $merged = DJabberd::Subscription->new;
+    $merged->{to}      = 1 if grep {$_->{to}} @merge;
+    $merged->{pendout} = 1 if not $merged->{to} and grep {$_->{pendout}} @merge;
+    $merged->{from}    = 1 if grep {$_->{from}} @merge;
+    $merged->{pendin}  = 1 if not $merged->{from} and grep {$_->{pendin}} @merge;
+    return $name, $merged->as_bitmask;
+}
-- 
1.6.3.3.473.gb74fc4.dirty

>From f2431a189c3282f4ef805548e53d182387c345f2 Mon Sep 17 00:00:00 2001
From: Alex Vandiver <a...@chmrr.net>
Date: Fri, 25 Sep 2009 14:21:34 -0400
Subject: [PATCH 5/5] Update CHANGES file

---
 DJabberd/CHANGES |    9 +++++++++
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/DJabberd/CHANGES b/DJabberd/CHANGES
index cc4ca2c..155a9dd 100644
--- a/DJabberd/CHANGES
+++ b/DJabberd/CHANGES
@@ -1,3 +1,12 @@
+  - IMPORTANT: JIDs are now case insensitive, and properly UTF8
+    normalized, as the spec calls for.  If you wish to preserve the
+    old behavior, set 'CaseSensitive' in your configuration.  If you
+    have existing roster storage, you will need to ensure that
+    existing entries with differing case are merged.  If you use the
+    SQLite roster, 'fix-jabber-roster-case' has been provided in the
+    DJabberd::RosterStorage::SQLite distribution.
+    (Alex Vandiver <ale...@bestpractical.com>)
+
   - Some configuration file docs
     (Alex Vandiver <ale...@bestpractical.com>)
 
-- 
1.6.3.3.473.gb74fc4.dirty

JID case sensitivity, again

Reply via email to