commit 9f66c760d2ca3df6848c5850382ac6c1c657acc9
Author: Karsten Loesing <[email protected]>
Date:   Tue Aug 7 15:34:51 2012 +0200

    Add raw geoipdbcomp report from 2009.
---
 2009/geoipdbcomp/.gitignore                       |    3 +
 2009/geoipdbcomp/bridge-usage-2009-10-12.png      |  Bin 0 -> 16861 bytes
 2009/geoipdbcomp/bridge-usage-top5-2009-10-12.png |  Bin 0 -> 27259 bytes
 2009/geoipdbcomp/commmax.png                      |  Bin 0 -> 90670 bytes
 2009/geoipdbcomp/geoipdbcomp.tex                  |  193 +++++++++++++++++++++
 2009/geoipdbcomp/max-commmax.png                  |  Bin 0 -> 20813 bytes
 2009/geoipdbcomp/max.png                          |  Bin 0 -> 82233 bytes
 2009/geoipdbcomp/soft.png                         |  Bin 0 -> 63833 bytes
 2009/geoipdbcomp/tor-commmax.png                  |  Bin 0 -> 37825 bytes
 2009/geoipdbcomp/tor-max.png                      |  Bin 0 -> 29287 bytes
 2009/geoipdbcomp/tor-soft.png                     |  Bin 0 -> 29913 bytes
 2009/geoipdbcomp/tor.png                          |  Bin 0 -> 75405 bytes
 12 files changed, 196 insertions(+), 0 deletions(-)

diff --git a/2009/geoipdbcomp/.gitignore b/2009/geoipdbcomp/.gitignore
new file mode 100644
index 0000000..44cba54
--- /dev/null
+++ b/2009/geoipdbcomp/.gitignore
@@ -0,0 +1,3 @@
+geoipdbcomp.pdf
+geoipdbcomp-2009-10-23.pdf
+
diff --git a/2009/geoipdbcomp/bridge-usage-2009-10-12.png 
b/2009/geoipdbcomp/bridge-usage-2009-10-12.png
new file mode 100644
index 0000000..36db099
Binary files /dev/null and b/2009/geoipdbcomp/bridge-usage-2009-10-12.png differ
diff --git a/2009/geoipdbcomp/bridge-usage-top5-2009-10-12.png 
b/2009/geoipdbcomp/bridge-usage-top5-2009-10-12.png
new file mode 100644
index 0000000..a4cb0ce
Binary files /dev/null and b/2009/geoipdbcomp/bridge-usage-top5-2009-10-12.png 
differ
diff --git a/2009/geoipdbcomp/commmax.png b/2009/geoipdbcomp/commmax.png
new file mode 100644
index 0000000..391fc45
Binary files /dev/null and b/2009/geoipdbcomp/commmax.png differ
diff --git a/2009/geoipdbcomp/geoipdbcomp.tex b/2009/geoipdbcomp/geoipdbcomp.tex
new file mode 100644
index 0000000..6881b44
--- /dev/null
+++ b/2009/geoipdbcomp/geoipdbcomp.tex
@@ -0,0 +1,193 @@
+\documentclass{article}
+\usepackage{url}
+\usepackage{graphicx}
+\usepackage{graphics}
+\usepackage{color}
+\usepackage{booktabs}
+\usepackage{multirow}
+\newcommand{\experimental}[1]{}
+\begin{document}
+\title{Comparison of GeoIP Databases for Tor}
+\author{Karsten Loesing\\\url{[email protected]}}
+\maketitle
+
+\begin{abstract}
+Tor uses a GeoIP database to resolve client IP addresses to country codes to 
get some basic statistics on connecting clients per country.
+Two recent events indicate that the GeoIP database that Tor ships is less 
accurate than expected:
+first, the update from the June 2009 to the September 2009 database removed 
almost all US IP addresses probably because of a provider-side database problem;
+second, in the aftermath of Tor being blocked in China at the end of September 
2009, not only Chinese bridge usage increased, but also usage from Japan and 
Australia, which is most likely a result of Chinese IP addresses falsely 
resolving to those countries.
+This report compares various, preferably free GeoIP databases for their 
accuracy in mapping IP addresses to country codes.
+In particular, accuracy is evaluated for countries that potentially censor the 
Internet.
+\end{abstract}
+
+\section{Motivation}
+
+Tor requires a GeoIP database that is as accurate as possible for resolving IP 
addresses of small countries like Iran or Tunisia.
+Two recent events indicate that the accuracy of the currently shipped database 
is not sufficient.
+
+\paragraph{Unreliable database updates}
+
+The GeoIP database in Tor is updated every few months, so as to reflect 
changes.
+However, the update from the June 2009 to September 2009 version introduced 
major changes to the database, removing almost all US IP addresses.
+In particular, the diff between old and new versions has 19833 deletions and 
only 10470 
insertions.\footnote{\url{http://archives.seul.org/or/cvs/Sep-2009/msg00269.html}}
+This likely corrupt database update indicates that the database provider is 
not as reliable as expected.
+
+\paragraph{False classification of Chinese IP addresses}
+
+Starting on September 25, 2009, the number of bridge users coming from China 
increased significantly as a response to the blocking of Tor relays in China.
+But at the same time, statistics show a significant increase of Australian and 
Japanese bridge usage for no good reason (see Figure~\ref{fig:china}).
+The most likely explanation is that the GeoIP database falsely classifies 
Chinese IP addresses as belonging to either Australia or Japan.
+It is impossible to say whether the reason is that GeoIP databases on the 
bridges doing the resolution are outdated, or if the inaccuracy still persists 
in more recent versions.
+
+\begin{figure}
+\begin{minipage}{0.495\textwidth}
+\includegraphics[width=\linewidth]{bridge-usage-2009-10-12.png}
+\end{minipage}
+\begin{minipage}{0.495\textwidth}
+\includegraphics[width=\linewidth]{bridge-usage-top5-2009-10-12.png}
+\end{minipage}
+\caption{Possibly false classification of Chinese IP addresses as Australian 
or Japanese addresses}
+\label{fig:china}
+\end{figure}
+
+\section{Data basis}
+
+We have included the following four databases in the evaluation:
+
+\begin{enumerate}
+\item the June 2009 database from \url{http://ip-to-country.webhosting.info/} 
as it is included in current Tor versions,
+\item the most recent (as of October 19, 2009) freely available Maxmind 
database from \url{http://www.maxmind.com/},
+\item the most recent (as of October 19, 2009) GPLv3-licensed database from 
\url{http://software77.net/geo-ip/}, and
+\item a copy of the commercial Maxmind database from October 20, 2009.
+\end{enumerate}
+
+\section{Coverage of databases}
+
+We start with comparing what IP address ranges the GeoIP databases cover.
+We iterate over all $2^{32}$ possible IPv4 addresses and visualize what 
country codes these addresses evaluate to.
+The comparison result is an image of 1\,024 $\times$ 1\,024 pixels with the 
color of each pixel showing what country code the addresses resolve to.
+Every pixel in this image represents 4\,096 $= 2^{12}$ IP addresses, so that 
all 1\,024 $\times$ 1\,024 $= 2^{10} \times 2^{10}$ pixels display all 
4\,294\,967\,296 $=2^{32}$ possible IP addresses.
+The first line of the image contains IP addresses 0.0.0.0 to 0.63.255.255, the 
second line 0.64.0.0 to 0.127.255.255, etc.
+The colors are chosen rather arbitrarily just in order to distinguish 
different ranges:
+country codes from \texttt{AA} to \texttt{IQ} are displayed in different 
shades of red, codes from \texttt{IR} to \texttt{RH} in shades of green, and 
codes from \texttt{RI} to \texttt{ZZ} in shades of blue.
+
+In this and all subsequent analyses, the following ranges of reserved IP 
addresses have been removed from all databases:
+0.0.0.0/8, 10.0.0.0/8, 127.0.0.0/8, 169.254.0.0/16, 172.16.0.0/12, 
192.0.0.0/24, 192.0.2.0/24, 192.88.99.0/24, \linebreak192.168.0.0/16, 
198.18.0.0/15, 198.51.100.0/24, 203.0.113.0/24, and 224.0.0.0/3.
+%0.0.0.0..0.255.255.255                0.0.0.0/8
+%10.0.0.0..10.255.255.255      10.0.0.0/8
+%127.0.0.0..127.255.255.255    127.0.0.0/8
+%169.254.0.0..169.254.255.255  169.254.0.0/16
+%172.16.0.0..172.31.255.255    172.16.0.0/12
+%192.0.0.0..192.0.0.255                192.0.0.0/24
+%192.0.2.0..192.0.2.255                192.0.2.0/24
+%192.88.99.0..192.88.99.255    192.88.99.0/24
+%192.168.0.0..192.168.255.255  192.168.0.0/16
+%198.18.0.0..198.19.255.255    198.18.0.0/15
+%198.51.100.0..198.51.100.255  198.51.100.0/24
+%203.0.113.0..203.0.113.255    203.0.113.0/24
+%224.0.0.0..255.255.255.255    224.0.0.0/3
+We further removed non-country codes like \texttt{A1} (Anonymous Proxy) and 
\texttt{A2} (Satellite Provider) from both Maxmind databases before the 
analysis.
+
+Figure~\ref{fig:comp} shows the visualizations of the four databases.
+These images show the general distribution of assigned IP addresses with the 
reserved ranges being blank.
+However, the differences between the databases seem to be negligible on these 
images.
+
+\begin{figure}[t]
+\begin{minipage}{\linewidth}
+\centering
+\includegraphics[width=.45\textwidth]{tor.png}
+\includegraphics[width=.45\textwidth]{max.png}
+\end{minipage}
+
+\vspace{0.1cm}
+\begin{minipage}{\linewidth}
+\centering
+\includegraphics[width=.45\textwidth]{soft.png}
+\includegraphics[width=.45\textwidth]{commmax.png}
+\end{minipage}
+\caption{IP address coverage of the Tor database (top left), free Maxmind 
database (top right), Software 77 database (bottom left), and commercial 
Maxmind database (bottom right)}
+\label{fig:comp}
+\end{figure}
+
+\section{Pairwise comparison of databases}
+
+In the next step towards evaluating accuracy of the various GeoIP databases, 
we perform a pair-wise comparison.
+Obviously, this comparison cannot show which database is more accurate than 
the other, but we might be able to detect artifacts by using this approach.
+
+The evaluation iterates over all $2^{32}$ possible IPv4 addresses and compares 
the results of the two databases. There are four possible cases for this 
comparison:
+
+\begin{enumerate}
+\item only the first database resolves the IP address to a country code,
+\item only the second database resolves the IP address to a country code,
+\item both databases resolve the IP address to two different country codes, or
+\item both databases resolve the IP address to the same country code.
+\end{enumerate}
+
+We visualize the difference between the two databases in terms of the number 
of addresses falling into cases 1 to 3.
+The comparison result is, again, an image of 1\,024 $\times$ 1\,024 pixels 
with the color of each pixel showing how different the two databases are.
+% Every pixel in this image represents 4\,096 $= 2^{12}$ IP addresses, so that 
all 1\,024 $\times$ 1\,024 $= 2^{10} \times 2^{10}$ pixels display all 
4\,294\,967\,296 $=2^{32}$ possible IP addresses.
+% The first line of the image contains IP addresses 0.0.0.0 to 0.63.255.255, 
the second line 0.64.0.0 to 0.127.255.255, etc.
+The pixel color visualizes the fraction of cases (1 to 3) that could be 
observed when comparing the two databases:
+addresses that are only contained in the first database (case 1) are displayed 
in different shades of red;
+IP addresses that were only found in the second database (case 2) are 
displayed in shades of blue;
+addresses having different country results in both databases (case 3) are 
displayed in shades of green;
+% if more than one case occurs in a range of 4\,096 addresses, only the 
dominating case is displayed;
+addresses that are resolved to the same country (case 4) do not add any color 
to the pixel.
+For example, a full line (or even area) of red means that only the first 
database contains a resolution for the IP addresses in the given range.
+
+Figure~\ref{fig:pair} shows the comparison of the Tor database with the three 
other databases as well as the comparison of free and commercial Maxmind 
databases.
+The comparison of the Tor database with both the free and the commercial 
Maxmind database shows a large number of red and blue lines indicating 
addresses that are contained in only one of the two databases.
+In addition to that, there are some green lines showing that the databases 
disagree on the country resolution.
+
+The comparison of the Tor database with the Software 77 database shows a 
rather different picture.
+These two databases disagree in many more places, as shown by the large green 
areas.
+The image also shows some artifacts in the middle of the image.
+Many of the green lines are exactly 65\,536 IP addresses long, which 
corresponds to a /16 network.
+These lines are not visible in the comparison to the Maxmind databases.
+It might be that the Software 77 database has a much lower resolution than the 
other databases.
+
+The comparison of the free with the commercial Maxmind database shows only 
very few differences which are mostly red and blue lines. This means that the 
two databases cover slightly different IP address ranges, but in general they 
are very similar.
+
+\begin{figure}[t]
+\begin{minipage}{\linewidth}
+\centering
+\includegraphics[width=.45\textwidth]{tor-max.png}
+\includegraphics[width=.45\textwidth]{tor-soft.png}
+\end{minipage}
+
+\vspace{0.1cm}
+\begin{minipage}{\linewidth}
+\centering
+\includegraphics[width=.45\textwidth]{tor-commmax.png}
+\includegraphics[width=.45\textwidth]{max-commmax.png}
+\end{minipage}
+\caption{Comparison of the Tor database with the free Maxmind database (top 
left), with the Software 77 database (top right), with the commercial Maxmind 
database (bottom left), and comparison of free with commercial Maxmind database 
(bottom right); red = only in first database, blue = only in second database, 
green = different results}
+\label{fig:pair}
+\end{figure}
+
+\section{Work left to do}
+
+\begin{itemize}
+\item Which are the official reserved address ranges? The ones listed here 
have been taken from the Software 77 database.
+\item Do we need to handle region codes like \texttt{AP} (Asia/Pacific Region) 
and \texttt{EU} (Europe) in a special way?
+\item In the next step, focus only on possibly censoring countries that are 
interesting to Tor: Azerbaijan (AZ), Belarus (BY), China (CN), Egypt (EG), Iran 
(IR), Jordan (JO), Kazakhstan (KZ), Morocco (MA), Myanmar (MM), Pakistan (PK), 
Russia (RU), Saudi Arabia (SA), Sudan (SD), Syria (SY), Tunisia (TN), U.A.E. 
(AE), Uzbekistan (UZ), Viet Nam (VN), and Yemen (YE). As a possible (though not 
perfect) metric: how many IP addresses do the GeoIP databases resolve to these 
countries? The more, the better?
+\item Try confirming/falsifying samples of resolutions by making requests to 
the WHOIS database or using some other networking fu.
+\item Try to learn what changes in the regular updates: are those only new 
assignments, or are existing ranges re-assigned to other countries, maybe even 
following a pattern?
+\end{itemize}
+
+%karsten@x61s:~/Desktop/geoipdb/geoipdbcomp/data$ grep -m 5 AP maxmind.csv 
+%"59.151.128.0","59.151.191.255","999784448","999800831","AP","Asia/Pacific 
Region"
+%999784448,999800831,AU
+%
+%"61.14.128.88","61.14.128.95","1024360536","1024360543","AP","Asia/Pacific 
Region"
+%1024360536,1024360543,AU
+%
+%"61.14.130.48","61.14.130.63","1024361008","1024361023","AP","Asia/Pacific 
Region"
+%1024361008,1024361039,AU ***
+%
+%"61.14.130.136","61.14.130.143","1024361096","1024361103","AP","Asia/Pacific 
Region"
+%1024361096,1024361103,AU
+%
+
+\end{document}
+
diff --git a/2009/geoipdbcomp/max-commmax.png b/2009/geoipdbcomp/max-commmax.png
new file mode 100644
index 0000000..cf498cf
Binary files /dev/null and b/2009/geoipdbcomp/max-commmax.png differ
diff --git a/2009/geoipdbcomp/max.png b/2009/geoipdbcomp/max.png
new file mode 100644
index 0000000..cebec69
Binary files /dev/null and b/2009/geoipdbcomp/max.png differ
diff --git a/2009/geoipdbcomp/soft.png b/2009/geoipdbcomp/soft.png
new file mode 100644
index 0000000..9bd96fd
Binary files /dev/null and b/2009/geoipdbcomp/soft.png differ
diff --git a/2009/geoipdbcomp/tor-commmax.png b/2009/geoipdbcomp/tor-commmax.png
new file mode 100644
index 0000000..4842ae6
Binary files /dev/null and b/2009/geoipdbcomp/tor-commmax.png differ
diff --git a/2009/geoipdbcomp/tor-max.png b/2009/geoipdbcomp/tor-max.png
new file mode 100644
index 0000000..cbe5529
Binary files /dev/null and b/2009/geoipdbcomp/tor-max.png differ
diff --git a/2009/geoipdbcomp/tor-soft.png b/2009/geoipdbcomp/tor-soft.png
new file mode 100644
index 0000000..7569929
Binary files /dev/null and b/2009/geoipdbcomp/tor-soft.png differ
diff --git a/2009/geoipdbcomp/tor.png b/2009/geoipdbcomp/tor.png
new file mode 100644
index 0000000..23e37de
Binary files /dev/null and b/2009/geoipdbcomp/tor.png differ



_______________________________________________
tor-commits mailing list
[email protected]
https://lists.torproject.org/cgi-bin/mailman/listinfo/tor-commits

Reply via email to