commit 9495a0361e2dbd603038fe0ac2bfc29db7e9b2e5
Author: Karsten Loesing <[email protected]>
Date:   Mon Dec 22 11:04:12 2014 +0100

    Add Virgil's Tor growth report.
---
 2014/tor-growth/.gitignore                         |    3 +
 .../figs/5a--normalized-torperf-small.png          |  Bin 0 -> 130729 bytes
 .../figs/NUR-predicts-norm-torperf-large.png       |  Bin 0 -> 182833 bytes
 .../figs/NUR-predicts-norm-torperf-small.png       |  Bin 0 -> 183043 bytes
 2014/tor-growth/figs/appendix--NUR-over-time.png   |  Bin 0 -> 109859 bytes
 2014/tor-growth/figs/fig1-relays.png               |  Bin 0 -> 91418 bytes
 2014/tor-growth/figs/fig2-bw.png                   |  Bin 0 -> 91863 bytes
 2014/tor-growth/figs/fig3-mean-bw.png              |  Bin 0 -> 109277 bytes
 2014/tor-growth/figs/fig4--torperf.png             |  Bin 0 -> 213045 bytes
 ...fig6--NUR-predicts-normalized-torperf_large.png |  Bin 0 -> 147906 bytes
 .../figs/fig6--NUR-predicts-normtorperf_small.png  |  Bin 0 -> 149461 bytes
 2014/tor-growth/figs/non-tor-bw.png                |  Bin 0 -> 108303 bytes
 2014/tor-growth/scripts/NUR-predicts-normtorperf.R |  211 +++++++++++++++++
 2014/tor-growth/scripts/NUR.R                      |  102 ++++++++
 2014/tor-growth/scripts/bandwidth-per-relay.R      |   95 ++++++++
 2014/tor-growth/scripts/bandwidth.R                |   90 +++++++
 2014/tor-growth/scripts/doubling-table.R           |  148 ++++++++++++
 2014/tor-growth/scripts/non-tor-bandwidth.R        |   70 ++++++
 2014/tor-growth/scripts/relays.R                   |   97 ++++++++
 2014/tor-growth/scripts/torperf.R                  |  136 +++++++++++
 2014/tor-growth/tor-growth.tex                     |  245 ++++++++++++++++++++
 2014/tor-growth/tortechrep.cls                     |    1 +
 22 files changed, 1198 insertions(+)

diff --git a/2014/tor-growth/.gitignore b/2014/tor-growth/.gitignore
new file mode 100644
index 0000000..e4ba2a9
--- /dev/null
+++ b/2014/tor-growth/.gitignore
@@ -0,0 +1,3 @@
+tor-growth.pdf
+tor-growth-2014-10-04.pdf
+
diff --git a/2014/tor-growth/figs/5a--normalized-torperf-small.png 
b/2014/tor-growth/figs/5a--normalized-torperf-small.png
new file mode 100644
index 0000000..b13592b
Binary files /dev/null and 
b/2014/tor-growth/figs/5a--normalized-torperf-small.png differ
diff --git a/2014/tor-growth/figs/NUR-predicts-norm-torperf-large.png 
b/2014/tor-growth/figs/NUR-predicts-norm-torperf-large.png
new file mode 100644
index 0000000..4798a36
Binary files /dev/null and 
b/2014/tor-growth/figs/NUR-predicts-norm-torperf-large.png differ
diff --git a/2014/tor-growth/figs/NUR-predicts-norm-torperf-small.png 
b/2014/tor-growth/figs/NUR-predicts-norm-torperf-small.png
new file mode 100644
index 0000000..2b08b3b
Binary files /dev/null and 
b/2014/tor-growth/figs/NUR-predicts-norm-torperf-small.png differ
diff --git a/2014/tor-growth/figs/appendix--NUR-over-time.png 
b/2014/tor-growth/figs/appendix--NUR-over-time.png
new file mode 100644
index 0000000..6badfc6
Binary files /dev/null and b/2014/tor-growth/figs/appendix--NUR-over-time.png 
differ
diff --git a/2014/tor-growth/figs/fig1-relays.png 
b/2014/tor-growth/figs/fig1-relays.png
new file mode 100644
index 0000000..16d4a29
Binary files /dev/null and b/2014/tor-growth/figs/fig1-relays.png differ
diff --git a/2014/tor-growth/figs/fig2-bw.png b/2014/tor-growth/figs/fig2-bw.png
new file mode 100644
index 0000000..65f2456
Binary files /dev/null and b/2014/tor-growth/figs/fig2-bw.png differ
diff --git a/2014/tor-growth/figs/fig3-mean-bw.png 
b/2014/tor-growth/figs/fig3-mean-bw.png
new file mode 100644
index 0000000..5da852b
Binary files /dev/null and b/2014/tor-growth/figs/fig3-mean-bw.png differ
diff --git a/2014/tor-growth/figs/fig4--torperf.png 
b/2014/tor-growth/figs/fig4--torperf.png
new file mode 100644
index 0000000..ba32cd4
Binary files /dev/null and b/2014/tor-growth/figs/fig4--torperf.png differ
diff --git 
a/2014/tor-growth/figs/fig6--NUR-predicts-normalized-torperf_large.png 
b/2014/tor-growth/figs/fig6--NUR-predicts-normalized-torperf_large.png
new file mode 100644
index 0000000..5983fe2
Binary files /dev/null and 
b/2014/tor-growth/figs/fig6--NUR-predicts-normalized-torperf_large.png differ
diff --git a/2014/tor-growth/figs/fig6--NUR-predicts-normtorperf_small.png 
b/2014/tor-growth/figs/fig6--NUR-predicts-normtorperf_small.png
new file mode 100644
index 0000000..4036d05
Binary files /dev/null and 
b/2014/tor-growth/figs/fig6--NUR-predicts-normtorperf_small.png differ
diff --git a/2014/tor-growth/figs/non-tor-bw.png 
b/2014/tor-growth/figs/non-tor-bw.png
new file mode 100644
index 0000000..acdb646
Binary files /dev/null and b/2014/tor-growth/figs/non-tor-bw.png differ
diff --git a/2014/tor-growth/scripts/NUR-predicts-normtorperf.R 
b/2014/tor-growth/scripts/NUR-predicts-normtorperf.R
new file mode 100644
index 0000000..ca01f45
--- /dev/null
+++ b/2014/tor-growth/scripts/NUR-predicts-normtorperf.R
@@ -0,0 +1,211 @@
+rm( list = ls() )
+setwd('/Users/know/Desktop/tor analytics/')
+library(car)    # for pretty plots
+#library(matlab) # for matlab function names
+#library(data.table) # for data.table like data.frame
+#library(xtable) # for exporting to LaTeX
+#library(gdata)
+#library(lmtest) # for testing linear models
+
+
+library(plyr) # for renaming columns
+source("colortitles.R")
+#######################################################
+
+process_torperf_rawdata <- function( filename, filesize_to_consider=5242880 )
+{
+  
+  # Import data from TorPerf
+  Dtp <- read.csv( filename )[c('date','size','source','q1','md','q3')]
+  Dtp <- rename(Dtp, c("size"="filesize") )
+  
+  print(unique(Dtp$filesize))
+  # only use the aggregated Tor data for downloading a 5 MiB file
+  Dtp <- subset( Dtp, source=='' & filesize==filesize_to_consider )
+  
+  #print( tail(Dtp) )
+  
+  # drop the source and filesize column
+  #Dtp <- Dtp[ , -which(names(Dtp) %in% c('source','filesize'))]
+  Dtp <- Dtp[ , -which(names(Dtp) %in% c('source'))]
+  
+  # rename the q1, md, and q3 for TIME
+  Dtp <- rename(Dtp, c("q1"="time_q1","md"="time_md","q3"="time_q3") )
+  
+  # convert time from MILLISECONDS -> SECONDS
+  Dtp$time_q1 <- Dtp$time_q1/1000
+  Dtp$time_md <- Dtp$time_md/1000
+  Dtp$time_q3 <- Dtp$time_q3/1000
+  
+  
+  # now create the bw_q1, bw_md, bw_q3 in: KiB/s
+  Dtp[c("bw_q1","bw_md","bw_q3")] <- c(NA,NA,NA)
+  
+  
+  # Rewrite q1, md, and q3 to be in bandwidth (KiB/s)
+  Dtp$bw_q1 <- (filesize_to_consider / 1024) / Dtp$time_q1;
+  Dtp$bw_md <- (filesize_to_consider / 1024) / Dtp$time_md;
+  Dtp$bw_q3 <- (filesize_to_consider / 1024) / Dtp$time_q3;
+  
+  return(Dtp)
+}
+
+
+composite_nontor_bandwidth <- function( filename )
+{
+  D_netindex <- read.csv( filename )[c('date','country_code','download_kbps')]
+  
+  D_netindex$download_kbps <- D_netindex$download_kbps * (1000 / 1024)
+  
+  
+  # make a single download_rate averaging across the countries: US, DE, RU.
+  D_US = subset( D_netindex, country_code=='US' )
+  D_DE = subset( D_netindex, country_code=='DE' )
+  D_RU = subset( D_netindex, country_code=='RU' )
+  
+  # merge the US, DE, and RU bandwidths
+  D_temp <- merge( D_US, D_DE, by='date' )
+  D_ni <- merge( D_temp, D_RU, by='date' )
+  
+  # drop the country codes
+  D_ni <- D_ni[ , -which(names(D_ni) %in% 
c('country_code.x','country_code.y','country_code'))]
+  
+  # average the download KiB/s entries into one
+  D_ni$avr_download_KBps <- NA
+  D_ni$avr_download_KBps <- (D_ni$download_kbps.x + D_ni$download_kbps.y + 
D_ni$download_kbps) / 3.0
+  
+  # drop the country-specific download rates
+  D_ni <- D_ni[ , -which(names(D_ni) %in% 
c('download_kbps.x','download_kbps.y','download_kbps'))]
+  
+  D_ni <- rename(D_ni, c("avr_download_KBps"="download_kbps") )
+  
+  return( D_ni )
+  
+}
+
+# read in the Tor bandwidth data
+#################################################################
+Dbw <- read.csv('bandwidth-clean.csv')[c('day','advbw','date','bwread')]
+
+# convert units from B/s to KiB/s
+Dbw$bwread <- Dbw$bwread / 1024
+Dbw$advbw <- Dbw$advbw / 1024
+
+
+## plot the clients and advertised bandwidth with time
+#plot(day, log2(clients), pch=20, ylab='Clients', xlab='Year', xaxt='n')
+
+Dbw[c("congestion")] <- c(NA)
+Dbw$congestion <- Dbw$bwread / Dbw$advbw
+
+# remove all instances of Dbw with congestion==NA
+Dbw <- subset(Dbw, ! is.na(congestion) )
+Dbw <- subset(Dbw, day > 917 ) 
+
+# remove some outliers
+Dbw <- Dbw[-c(318,367,2),]
+
+
+# read in the non-Tor bandwidth data
+#################################################################
+D_netindex <- composite_nontor_bandwidth('country_daily_speeds.csv')
+
+# Pull in the torperf data for downloading a 5 MiB file
+#################################################################
+#Dtp <- process_torperf_rawdata('torperf-clean.csv',51200)
+Dtp <- process_torperf_rawdata('torperf-clean.csv',5242880)
+
+# now merge Dtp and D by date
+Dnorm <- merge(D_netindex,Dtp,by='date')
+
+# drop some columns we don't need
+Dnorm <- Dnorm[ , -which(names(Dnorm) %in% 
c('time_q1','time_md','time_q3','country_code','filesize','bw_q1','bw_q3','day'))]
+
+Dnorm$normalized_torperf <- NA
+Dnorm$normalized_torperf <- (Dnorm$bw_md / Dnorm$download_kbps) * 100
+
+# now merge Dnorm and Dbw into D
+D <- merge(Dnorm, Dbw, by='date' )
+
+D <- D[ , -which(names(D) %in% c('download_kbps','bw_md','advbw','bwread'))]
+
+#plot( D$day, D$normalized_torperf )
+#plot( D$congestion, D$normalized_torperf )
+
+m0 <- lm( D$normalized_torperf ~ D$day )
+m1 <- lm( D$normalized_torperf ~ D$congestion )
+m2 <- lm( D$normalized_torperf ~ D$day+D$congestion )
+
+
+#m3 <- lm( log2(D$normalized_torperf) ~ D$congestion+D$day )
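+
+# Sequential two-factor ANOVA (a sketch of the tables reported in the tech
+# report): day (DATE) and congestion (NUR) predicting normalized Torperf
+anova( m2 )
+summary( m2 )  # r^2 of the two-factor model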
+
+# break D into years, more/less affected by the botnet
+D2010 <- D[grepl("2010", D$date), ]
+D2011 <- D[grepl("2011", D$date), ]
+D2012 <- D[grepl("2012", D$date), ]
+D2013 <- D[grepl("2013", D$date), ]
+D2014 <- D[grepl("2014", D$date), ]
+
+
+#dim(D2012)
+par(las=1)
+
+#plot( D$day, D$normalized_torperf, ylab='Norm Torperf', xlab='day', cex=0.1 )
+#plot( D$congestion, D$normalized_torperf, ylab='Norm Torperf', 
xlab="Network's Read / Advertised Bandwidth", cex=0.2, xaxt='n', yaxt='n' )
+plot( D$congestion, D$normalized_torperf, ylab='Normalized Torperf', 
xlab="NUR", cex=0.2, xaxt='n', yaxt='n' )
+#abline( m1 )
+
+# plot as a function of advbw_per_client
+par(new = T)
+points(D2010$congestion, D2010$normalized_torperf, pch=20, cex=0.7, col='red' )
+points(D2011$congestion, D2011$normalized_torperf, pch=20, cex=0.7, 
col='orange' )
+points(D2012$congestion, D2012$normalized_torperf, pch=20, cex=0.7, 
col='green' )
+points(D2013$congestion, D2013$normalized_torperf, pch=20, cex=0.7, col='blue' 
)
+points(D2014$congestion, D2014$normalized_torperf, pch=20, cex=0.7, 
col='purple' )
+
+
+#title("Lower network utilization has little impact on Torperf (50 KiB)")
+#title("Lower network utilization implies faster Torperf (5 MiB)")
+
+## Set the Legend and Title
+##################################################################
+legend_texts = c("2010","2011","2012", "2013", "2014")
+legend( "topright", legend=legend_texts, inset=0.01, pch=c(20,20), 
col=c('red','orange','green','blue','purple') ) 
+
+
+CongestLabels=c('','50%','','60%','','70%','')
+CongestLocations=c(.45,.50,.55,.60,.65,.70,.75)
+axis(1,at=CongestLocations,labels=CongestLabels )
+
+
+# normalized Torperf labels
+# For 50 KiB
+#TorperfLabels=c('.05%','.1%','.15%','.2%','.25%')
+#TorperfLocations=c(.05,.1,.15,.2,.25)
+#axis(2,at=TorperfLocations,labels=TorperfLabels )
+
+# For 5 MiB
+TorperfLabels=c('.5%','1%','1.5%','2%','2.5%')
+TorperfLocations=c(.5,1,1.5,2,2.5)
+axis(2,at=TorperfLocations,labels=TorperfLabels )
+
+
+
+
+#################################################################
+
+nfit <- lm( normalized_torperf ~ congestion, data=D )
+summary( nfit )
+first_x <- .45
+last_x <- .75
+
+# add the line segment for the predicted line
+segments( first_x, predict(nfit, data.frame(congestion=first_x)),
+          last_x, predict(nfit, data.frame(congestion=last_x)),
+          col="black", lty=2, lwd=3 )
+
+# Add a point highlighting the beginning of the line
+points( first_x, predict(nfit, data.frame(congestion=first_x)), col="black", 
pch=15, cex=1.3)
+
+
diff --git a/2014/tor-growth/scripts/NUR.R b/2014/tor-growth/scripts/NUR.R
new file mode 100644
index 0000000..68cd1a4
--- /dev/null
+++ b/2014/tor-growth/scripts/NUR.R
@@ -0,0 +1,102 @@
+  rm( list = ls() )
+  setwd('/Users/know/Desktop/tor analytics/')
+  library(car)    # for pretty plots
+  
+  library(plyr) # for renaming columns
+  source("colortitles.R")
+  #######################################################
+  
+  D <- read.csv('bandwidth-clean.csv')[c('day','advbw','bwread','date')]
+  #D <- rename(D, c("date"="day") )
+  
+  # convert from B/s to MiB/s
+  D$advbw <- D$advbw / 1048576.0
+  D$bwread <- D$bwread / 1048576.0
+  
+  ## Plot the "Congestion" --- read / advertised
+  ##################################################################
+  D[c("congestion")] <- NA
+  D$congestion <- D$bwread / D$advbw
+  
+  ####### Remove some outliers 
####################################################
+  Dother <- subset(D, congestion <= 0.01 )
+  D <- subset(D, congestion > 0.01 )
+  # drop all points between days [1200,1310] with congestion BELOW 0.55
+  outliers <- subset(D, ( 1200 <= day & day <= 1310 & congestion <=0.55) | 
(2258 <= day & day <= 2320 & congestion <= 0.47) )
+  
+  D <- subset(D, !( 1200 <= day & day <= 1310 & congestion <=0.55) )
+  D <- subset(D, !(2258 <= day & day <= 2320 & congestion <= 0.47) )
+  
#################################################################################
+  
+  ####### Put into groups ####################################################
+  cut_off1 <- 2173                    # delta=2173 is date 2013-10-08
+  cut_off2 <- 2413                    # delta=2413 is date 2014-06-05
+  g1 <- subset(D, day <= cut_off1 & congestion >= 0.5 )
+  g2 <- subset(D, cut_off1 < day & day <= cut_off2 )
+  g3 <- subset(D, cut_off2 < day )
+  
#################################################################################
+  
+  par(las=1)
+  plot(D$day, D$congestion,
+       col='black', pch='.', cex=0.6, ylim=c(0.35,0.8), 
+       #xlab="Year", ylab="used bandwidth / capacity bandwidth", xaxt='n', 
yaxs="i")
+       xlab="Year", ylab="NUR", xaxt='n', yaxs="i")
+  
+  # plot the three groups
+  points( g1$day, g1$congestion, col='red', pch=20, cex=0.6 )
+  points( g2$day, g2$congestion, col='blue', pch=20, cex=0.6 )
+  points( g3$day, g3$congestion, col='green', pch=20, cex=0.6 )
+  
+  # plot the outliers
+  #points( outliers$day, outliers$congestion, col='black', pch=1, cex=0.6 )
+  
+  
+  ####### Set the pretty X-axis ############################
+  par(las=1)
+  YearLabels=seq(from=2008,to=2014,by=1)
+  YearLocations=c(66,432,797,1162,1527,1893,2258)
+  axis(1,at=YearLocations,labels=YearLabels )
+  ##########################################################
+  
+  
+  ## Plot the three best-fit lines
+  #################################################################
+  g1 <- subset(g1, day >= 1200)
+  fit_D <-  lm( congestion ~ day, data=D )
+  fit_g1 <- lm( congestion ~ day, data=g1 )
+  fit_g2 <- lm( congestion ~ day, data=g2 )
+  fit_g3 <- lm( congestion ~ day, data=g3 )
+  
+  
+  # add the line segment for the predicted line
+  segments( min(g1$day), predict(fit_g1, data.frame(day=min(g1$day))),
+            max(g1$day), predict(fit_g1, data.frame(day=max(g1$day))),
+            col="black", lty=1, lwd=3 )
+  
+  segments( min(g2$day), predict(fit_g2, data.frame(day=min(g2$day))),
+            max(g2$day), predict(fit_g2, data.frame(day=max(g2$day))),
+            col="black", lty=1, lwd=3 )
+  
+  segments( min(g3$day), predict(fit_g3, data.frame(day=min(g3$day))),
+            max(g3$day), predict(fit_g3, data.frame(day=max(g3$day))),
+            col="black", lty=1, lwd=3 )
+  
+  
+  # Add a point highlighting the beginning of the line
+  points( min(g1$day), predict(fit_g1, data.frame(day=min(g1$day))), 
col="black", pch=15, cex=1.3)
+  points( min(g2$day), predict(fit_g2, data.frame(day=min(g2$day))), 
col="black", pch=15, cex=1.3)
+  points( min(g3$day), predict(fit_g3, data.frame(day=min(g3$day))), 
col="black", pch=15, cex=1.3)
+  
+  
+  
+  
+  ## Set the Title and Legend
+  #################################################################
+  #title("Better: channel utilization in three distinct, flat stages")
+  legend_text = c('2010-04-30 to 2013-10-08', '2013-10-09 to 2014-06-05', 
'2014-06-06 to present')
+  legend( "bottomleft", legend=legend_text, inset=0.05, pch=c(20,20,20), 
col=c('red','blue','green') ) 
+  
+  
+  
+
+  
diff --git a/2014/tor-growth/scripts/bandwidth-per-relay.R 
b/2014/tor-growth/scripts/bandwidth-per-relay.R
new file mode 100644
index 0000000..426e7b3
--- /dev/null
+++ b/2014/tor-growth/scripts/bandwidth-per-relay.R
@@ -0,0 +1,95 @@
+rm( list = ls() )
+setwd('/Users/know/Desktop/tor analytics/')
+library(car)    # for pretty plots
+library(matlab) # for matlab function names
+library(data.table) # for data.table like data.frame
+#library(xtable) # for exporting to LaTeX
+#library(gdata)
+library(lmtest) # for testing linear models
+library(calibrate)
+
+library(plyr) # for renaming columns
+source("colortitles.R")
+#######################################################
+
+## Read in the data into data.frame D with columns: day, advbw, bwread, relays.all
+#######################################################
+#Dstable <- read.csv('relays-stable.csv')[c('date','relays')]
+#Dfast <- read.csv('relays-fast.csv')[c('date','relays')]
+#Dtemp <- merge(Dstable, Dfast, by="date", suffixes=c('.stable','.fast') )
+
+Dbandwidth <- read.csv('bandwidth-clean.csv')[c('date','advbw','bwread')]
+Drelays <- read.csv('relays-total.csv')[c('date','relays')]
+D <- merge( Dbandwidth, Drelays, by='date' )
+D <- rename(D, c("date"="day","relays"="relays.all"))
+names(D)
+
+# convert units from B/s to MiB/s
+D$advbw <- D$advbw / 1048576.0
+D$bwread <- D$bwread / 1048576.0
+
+
+D[c("advbw_per_relay","bwread_per_relay")] <- c(NA,NA)
+D$advbw_per_relay <- D$advbw / D$relays.all
+D$bwread_per_relay <- D$bwread / D$relays.all
+
+plot(D$day, log2(D$advbw_per_relay), ylab='Bandwidth (MiB/s) per relay', 
xlab='Year', yaxt='n', pch=20, cex=0.6, xaxt='n', col='blue' )
+points( D$day, log2(D$bwread_per_relay), pch=20, cex=0.6, col='red' )
+
+
+####### Set the pretty X-axis ###################################
+YearLabels=seq(from=2008,to=2014,by=1)
+YearLocations=c(66,432,797,1162,1527,1893,2258)
+axis(1,at=YearLocations,labels=YearLabels )
+#################################################################
+
+####### Set the pretty Y-axis ###################################
+par(las=1)
+lab <- seq(from=-3,to=1,by=1)
+axis(2,at=lab, labels=c("⅛","¼","½","1","2") )
+#################################################################
+
+## Set the Legend and Title
+##################################################################
+legend_texts = c(
+  expression(paste("Capacity (advertised) ", r^2, "=0.91")),
+  expression(paste("Used (read)                ", r^2, "=0.68"))  
+)
+
+
+legend( "topleft", legend=legend_texts, inset=0.05, pch=c(20,20), 
col=c('blue','red') ) 
+
+
+multiTitle(color="black","Average bandwidth per relay doubles every ", 
+           color="blue","1.8",
+           color="black",'-',
+           color="red","2.1",
+           color="black"," years")
+
+####### Plot the best-fit lines ############################
+
+# remove data before the 'read' metric started
+temp <- subset( D, !is.na(bwread) )
+first_day <- min(temp$day)
+FD <- D[ which(D$day >= first_day), ]
+
+# fit the Filtered Data to a linear model...
+fit_advbw <- lm( log2(advbw_per_relay) ~ day, data=FD )
+fit_bwread <- lm( log2(bwread_per_relay) ~ day, data=FD )
+
+
+# Add the best-fit lines
+segments( first_day, predict(fit_advbw, data.frame(day=first_day)),
+          max(FD$day), predict(fit_advbw, data.frame(day=max(FD$day))),
+          col="black", lty=2, lwd=3 )
+segments( first_day, predict(fit_bwread, data.frame(day=first_day)),
+          max(FD$day), predict(fit_bwread, data.frame(day=max(FD$day))),
+          col="black", lty=2, lwd=3 )
+
+# Add the black squares
+points( first_day, predict(fit_advbw, data.frame(day=min(FD$day))), 
col="black", pch=15, cex=1.3)
+points( first_day, predict(fit_bwread, data.frame(day=min(FD$day))), 
col="black", pch=15, cex=1.3)
+
+#summary( fit_all )
+#summary( fit_stable )
+
diff --git a/2014/tor-growth/scripts/bandwidth.R 
b/2014/tor-growth/scripts/bandwidth.R
new file mode 100644
index 0000000..cc2ac0a
--- /dev/null
+++ b/2014/tor-growth/scripts/bandwidth.R
@@ -0,0 +1,90 @@
+rm( list = ls() )
+setwd('/Users/know/Desktop/tor analytics/')
+library(car)    # for pretty plots
+#library(matlab) # for matlab function names
+#library(data.table) # for data.table like data.frame
+#library(xtable) # for exporting to LaTeX
+#library(gdata)
+#library(lmtest) # for testing linear models
+#library(calibrate)
+
+library(plyr) # for renaming columns
+#source("colortitles.R")
+#######################################################
+
+D <- read.csv('bandwidth-clean.csv')[c('date','advbw','bwread','bwwrite')]
+D <- rename(D, c("date"="day") )
+
+# convert from B/s to MiB/s
+D$advbw <- D$advbw / 1048576.0
+D$bwread <- D$bwread / 1048576.0
+D$bwwrite <- D$bwwrite / 1048576.0
+
+Xs = D$day;
+Ys_advbw = D$advbw;
+Ys_bwread = D$bwread;
+Ys_bwwrite = D$bwwrite;
+
+# data
+plot(Xs, log2(Ys_advbw), xlab="Year", ylab="Bandwidth (MiB/s)", yaxt='n', 
xaxt='n', col='blue', pch=20, cex=0.6 )
+points(Xs, log2(Ys_bwread), pch=20, col='red', cex=0.6 )
+
+
+####### Set the pretty X-axis ############################
+YearLabels=seq(from=2008,to=2014,by=1)
+YearLocations=c(66,432,797,1162,1527,1893,2258)
+axis(1,at=YearLocations,labels=YearLabels )
+##########################################################
+
+####### Set the pretty Y-axis ############################
+par(las=1)
+lab <- seq(from=1,to=45,by=1)
+axis(2,at=lab,labels=parse(text=paste("2^", lab, sep="")) )
+##########################################################
+
+
+####### Plot the best-fit lines ############################
+temp <- subset( D, !is.na(bwread), c('day','bwread') )
+first_bwread_day <- min(temp$day)
+
+# remove data before year 2010
+#FD <- D[ which(D$day >= 797), ]
+D <- D[ which(D$day >= first_bwread_day), ]
+
+fit_advbw <- lm( log2(advbw) ~ day, data=D )
+fit_bwread <- lm( log2(bwread) ~ day, data=D )
+summary( fit_advbw )
+summary( fit_bwread )
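+
+# Doubling times implied by the fitted slopes (a sketch: the slope of
+# log2(bandwidth) vs. day is doublings per day, so its reciprocal is days
+# per doubling; divide by 30 for months)
+(1.0 / coef(fit_advbw)[2]) / 30
+(1.0 / coef(fit_bwread)[2]) / 30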
+
+### these fits are also used for the best-fit line segments below
+
+
+# Add the best-fit line (with black square) for the advertised bandwidth
+segments( min(D$day), predict(fit_advbw, data.frame(day=min(D$day))),
+          max(D$day), predict(fit_advbw, data.frame(day=max(D$day))),
+          col="black", lty=2, lwd=3 )
+points( min(D$day), predict(fit_advbw, data.frame(day=min(D$day))), 
col="black", pch=15, cex=1.3)
+
+# Add the best-fit line (with black square) for the read bandwidth
+segments( min(D$day), predict(fit_bwread, data.frame(day=min(D$day))),
+          max(D$day), predict(fit_bwread, data.frame(day=max(D$day))),
+          col="black", lty=2, lwd=3 )
+points( min(D$day), predict(fit_bwread, data.frame(day=min(D$day))), 
col="black", pch=15, cex=1.3)
+
+
+## Set the Legend and Title
+##################################################################
+legend_texts = c(
+  expression(paste("Capacity (advertised) ", r^2, "=0.97")),
+  expression(paste("Used (read)                ", r^2, "=0.88"))  
+)
+
+legend( "topleft", legend=legend_texts, inset=0.05, pch=c(20,20), 
col=c('blue','red') ) 
+
+multiTitle(color="black","Tor relay bandwidth doubles every ", 
+           color="blue","13",
+           color="black",'-',
+           color="red","14",
+           color="black"," months")
+
+
diff --git a/2014/tor-growth/scripts/doubling-table.R 
b/2014/tor-growth/scripts/doubling-table.R
new file mode 100644
index 0000000..db64b53
--- /dev/null
+++ b/2014/tor-growth/scripts/doubling-table.R
@@ -0,0 +1,148 @@
+rm( list = ls() )
+setwd('/Users/know/Desktop/tor analytics/')
+
+
+process_torperf_rawdata <- function( filename, filesize_to_consider=5242880 )
+{
+  
+  # Import data from TorPerf
+  Dtp <- read.csv( filename )[c('day','date','size','source','q1','md','q3')]
+  Dtp <- rename(Dtp, c("size"="filesize") )
+  
+  print(unique(Dtp$filesize))
+  # only use the aggregated Tor data for downloading a 5 MiB file
+  Dtp <- subset( Dtp, source=='' & filesize==filesize_to_consider )
+  
+  #print( tail(Dtp) )
+  
+  # drop the source and filesize column
+  #Dtp <- Dtp[ , -which(names(Dtp) %in% c('source','filesize'))]
+  Dtp <- Dtp[ , -which(names(Dtp) %in% c('source'))]
+  
+  # rename the q1, md, and q3 for TIME
+  Dtp <- rename(Dtp, c("q1"="time_q1","md"="time_md","q3"="time_q3") )
+  
+  # convert time from MILLISECONDS -> SECONDS
+  Dtp$time_q1 <- Dtp$time_q1/1000
+  Dtp$time_md <- Dtp$time_md/1000
+  Dtp$time_q3 <- Dtp$time_q3/1000
+  
+  
+  # now create the bw_q1, bw_md, bw_q3 in: KiB/s
+  Dtp[c("bw_q1","bw_md","bw_q3")] <- c(NA,NA,NA)
+  
+  
+  # Rewrite q1, md, and q3 to be in bandwidth (KiB/s)
+  Dtp$bw_q1 <- (filesize_to_consider / 1024) / Dtp$time_q1;
+  Dtp$bw_md <- (filesize_to_consider / 1024) / Dtp$time_md;
+  Dtp$bw_q3 <- (filesize_to_consider / 1024) / Dtp$time_q3;
+  
+  return(Dtp)
+}
+
+#####################################################################################
+# Number of relays
+#####################################################################################
+
+# read the relay counts; entries with 937 or fewer relays are removed below
+Dall <- read.csv('relays-total.csv')[c('date','relays')]
+Dall$day <- NA
+Dall$day <- 1:nrow(Dall)
+
+plot( Dall$day, log2(Dall$relays), pch=20, cex=0.6, col='blue' )
+
+Dall <- subset(Dall, relays >= 938 )
+points(Dall$day, log2(Dall$relays), pch=20, cex=0.6, col='red' )
+
+mm <- lm( log2(Dall$relays) ~ Dall$day  )
+
+rows_to_remove <- abs(resid(mm)) > 0.38
+Dall <- Dall[ !rows_to_remove, ]
+
+points(Dall$day, log2(Dall$relays), pch=20, cex=0.6, col='purple' )
+
+mm <- lm( log2(Dall$relays) ~ Dall$day  )
+
+# the doubling rate (in years) follows from the slope of the fit
+(1.0 / coef(mm)[2]) / 365
+
+# which comes out to 2.99 years
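+
+# The same computation as a small helper (a sketch; assumes a model of the
+# form lm(log2(y) ~ day): the slope is doublings per day, so its reciprocal
+# is days per doubling)
+doubling_years <- function(model) (1.0 / coef(model)[2]) / 365
+doubling_years(mm)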
+
+#####################################################################################
+# total network bandwidth
+#####################################################################################
+rm( list = ls() )
+D <- read.csv('bandwidth-clean.csv')[c('date','advbw','bwread')]
+
+# convert from B/s to MiB/s
+D$advbw <- D$advbw / 1048576.0
+
+#D <- rename(D, c("date"="day") )
+D$day <- NA
+D$day <- 1:nrow(D)
+
+m <- lm( log2(D$advbw) ~ D$day )
+
+# monthly doubling rate from the slope of the fit
+(1.0 / coef(m)[2]) / 30
+
+
+#####################################################################################
+# Absolute Torperf
+#####################################################################################
+#rm( list = ls() )
+D_SMALL <- process_torperf_rawdata('torperf-clean.csv', 
51200)[c('date','day','bw_md')]
+D_BIG <- process_torperf_rawdata('torperf-clean.csv', 
5242880)[c('date','day','bw_md')]
+
+
+D_SMALL <- subset( D_SMALL, day>=547 )
+D_BIG <- subset( D_BIG, day>=547 )
+
+
+mSMALL <- lm( log2(bw_md) ~ day, data=D_SMALL )
+mBIG <- lm( log2(bw_md) ~ day, data=D_BIG )
+
+(1.0 / coef(mSMALL)[2]) / 30   # months per doubling, 50 KiB
+(1.0 / coef(mBIG)[2]) / 30     # months per doubling, 5 MiB
+
+
+
+#####################################################################################
+# Normalized Torperf
+#####################################################################################
+
+# Read in the netindex data and average it
+#######################################################
+D_netindex <- 
read.csv('country_daily_speeds.csv')[c('date','country_code','download_kbps')]
+
+D_netindex$download_kbps <- D_netindex$download_kbps * (1000 / 1024)
+
+
+# make a single download_rate averaging across the countries: US, DE, RU.
+D_US = subset( D_netindex, country_code=='US' )
+D_DE = subset( D_netindex, country_code=='DE' )
+D_RU = subset( D_netindex, country_code=='RU' )
+
+# merge the US, DE, and RU bandwidths
+D_temp <- merge( D_US, D_DE, by='date' )
+D_ni <- merge( D_temp, D_RU, by='date' )
+
+# drop the country codes
+D_ni <- D_ni[ , -which(names(D_ni) %in% 
c('country_code.x','country_code.y','country_code'))]
+
+# average the download KiB/s entries into one
+D_ni$avr_download_KBps <- NA
+D_ni$avr_download_KBps <- (D_ni$download_kbps.x + D_ni$download_kbps.y + 
D_ni$download_kbps) / 3.0
+
+# drop the country-specific download rates
+D_ni <- D_ni[ , -which(names(D_ni) %in% 
c('download_kbps.x','download_kbps.y','download_kbps'))]
+
+
+# now merge D_ni and {D_SMALL,D_BIG} based on date
+Dnorm_SMALL <- merge( D_ni, D_SMALL, by='date' )
+Dnorm_BIG <- merge( D_ni, D_BIG, by='date' )
+
+Dnorm_SMALL$normalized_bw <- NA; Dnorm_SMALL$normalized_bw <- 
Dnorm_SMALL$bw_md / Dnorm_SMALL$avr_download_KBps
+Dnorm_BIG$normalized_bw <- NA; Dnorm_BIG$normalized_bw <- Dnorm_BIG$bw_md / 
Dnorm_BIG$avr_download_KBps
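+
+# Doubling rates of the normalized series (a sketch: the report derives these
+# analytically from the component doubling rates, but a direct fit of the
+# same log2-vs-day form gives a comparable estimate)
+mn_SMALL <- lm( log2(normalized_bw) ~ day, data=Dnorm_SMALL )
+mn_BIG <- lm( log2(normalized_bw) ~ day, data=Dnorm_BIG )
+(1.0 / coef(mn_SMALL)[2]) / 365   # years per doubling, 50 KiB
+(1.0 / coef(mn_BIG)[2]) / 365     # years per doubling, 5 MiB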
+
+
diff --git a/2014/tor-growth/scripts/non-tor-bandwidth.R 
b/2014/tor-growth/scripts/non-tor-bandwidth.R
new file mode 100644
index 0000000..c062f55
--- /dev/null
+++ b/2014/tor-growth/scripts/non-tor-bandwidth.R
@@ -0,0 +1,70 @@
+rm( list = ls() )
+setwd('/Users/know/Desktop/tor analytics/non-tor-bw/')
+
+
+library(plyr) # for renaming columns
+
+#######################################################
+D_netindex <- read.csv('country_daily_speeds.csv')[c('date','country_code','download_kbps','upload_kbps')]
+
+D_netindex$download_kbps <- D_netindex$download_kbps * (1000 / 1024)
+D_netindex$upload_kbps <- D_netindex$upload_kbps * (1000 / 1024)
+
+
+
+D_US = subset( D_netindex, country_code=='US' )
+D_DE = subset( D_netindex, country_code=='DE' )
+D_RU = subset( D_netindex, country_code=='RU' )
+
+# merge the US, DE, and RU bandwidths
+D_temp <- merge( D_US, D_DE, by='date' )
+D_COMPOSITE <- merge( D_temp, D_RU, by='date' )
+
+# drop the country codes
+D_COMPOSITE <- D_COMPOSITE[ , -which(names(D_COMPOSITE) %in% 
c('country_code.x','country_code.y','country_code'))]
+
+# average the download KiB/s entries into one
+D_COMPOSITE$avr_download_KBps <- NA
+D_COMPOSITE$avr_download_KBps <- (D_COMPOSITE$download_kbps.x + 
D_COMPOSITE$download_kbps.y + D_COMPOSITE$download_kbps) / 3.0
+
+# drop the country-specific download rates
+D_COMPOSITE <- D_COMPOSITE[ , -which(names(D_COMPOSITE) %in% 
c('download_kbps.x','download_kbps.y','download_kbps'))]
+
+
+
+plot( 1:nrow(D_US), log2(D_US$download_kbps), yaxt='n', xaxt='n', col='red', 
cex=0.7, xlab="Year", ylab="Mean download bandwidth (KiB/s)", pch=20 )
+points( 1:nrow(D_RU), log2(D_RU$download_kbps), yaxt='n', xaxt='n', 
col='blue', cex=0.7, pch=20 )
+points( 1:nrow(D_DE), log2(D_DE$download_kbps), yaxt='n', xaxt='n', 
col='orange', cex=0.7, pch=20 )
+
+points( 1:nrow(D_COMPOSITE), log2(D_COMPOSITE$avr_download_KBps), col='black', 
cex=0.5, pch=20 )
+
+####### Set the pretty Y-axis ############################
+par(las=1)
+#lab <- seq(from=1,to=45,by=1)
+lab <- c(12,12.5,13,13.5,14,14.5)
+axis(2,at=lab,labels=parse(text=paste("2^", lab, sep="")) )
+##########################################################
+
+
+####### Set the pretty X-axis ###################################
+YearLabels=c('2009','2010','2011','2012','2013','2014')
+YearLocations=c(367, 732, 1097, 1462, 1828, 2193)
+axis(1,at=YearLocations,labels=YearLabels )
+#################################################################
+
+
+legend_texts = c("United Staes", "Germany", "Russia", "Composite")
+legend( "topleft", legend=legend_texts, inset=0.01, pch=c(20,20), 
col=c('red','orange','blue','black') ) 
+
+
+
+day <- 1:nrow(D_US)
+
+mm <- lm( log2(download_kbps) ~ day, data=D_US )
+1.0 / (mm$coefficients["day"] * 30)
+
+mm <- lm( log2(download_kbps) ~ day, data=D_DE )
+1.0 / (mm$coefficients["day"] * 30)
+
+mm <- lm( log2(download_kbps) ~ day, data=D_RU )
+1.0 / (mm$coefficients["day"] * 30)
diff --git a/2014/tor-growth/scripts/relays.R b/2014/tor-growth/scripts/relays.R
new file mode 100644
index 0000000..f95ba77
--- /dev/null
+++ b/2014/tor-growth/scripts/relays.R
@@ -0,0 +1,97 @@
+rm( list = ls() )
+setwd('/Users/know/Desktop/tor analytics/')
+library(car)    # for pretty plots
+#library(matlab) # for matlab function names
+#library(data.table) # for data.table like data.frame
+#library(xtable) # for exporting to LaTeX
+#library(gdata)
+#library(lmtest) # for testing linear models
+#library(calibrate)
+
+library(plyr) # for renaming columns
+
+#######################################################
+
+## Read in the data into data.frame D with columns: day, relays.stable, relays.all
+#######################################################
+Dstable <- read.csv('relays-stable.csv')[c('date','relays')]
+#Dfast <- read.csv('relays-fast.csv')[c('date','relays')]
+#Dtemp <- merge(Dstable, Dfast, by="date", suffixes=c('.stable','.fast') )
+Dall <- read.csv('relays-total.csv')[c('date','relays')]
+D <- merge( Dstable, Dall, by='date', suffixes=c('.stable','.all') )
+D <- rename(D, c("date"="day"))
+names(D)
+
+plot(D$day, log2(D$relays.all), ylab='Number of Relays', xlab='Year', 
yaxt='n', pch=20, cex=0.6, xaxt='n', col='blue', ylim=c(8,13) ) 
+
+points( D$day, log2(D$relays.stable), pch=20, cex=0.6, col='purple' )
+
+
+plot(1:nrow(Dall), log2(Dall$relays), ylab='Number of Relays', xlab='Year', 
yaxt='n', pch=20, cex=0.6, xaxt='n', col='blue', ylim=c(8,13) )
+
+
+####### Set the pretty X-axis ###################################
+YearLabels=seq(from=2008,to=2014,by=1)
+YearLocations=c(66,432,797,1162,1527,1893,2258)
+axis(1,at=YearLocations,labels=YearLabels )
+#################################################################
+
+####### Set the pretty Y-axis ###################################
+par(las=1)
+lab <- seq(from=1,to=45,by=1)
+axis(2,at=lab,labels=parse(text=paste("2^", lab, sep="")) )
+#################################################################
+
+
+
+
+
+
+## Set the Legend and Title
+##################################################################
+legend_texts = c(
+  expression(paste("All relays         ", r^2, "=0.96")),
+  expression(paste("Stable relays   ", r^2, "=0.93"))  
+)
+
+legend( "topleft", legend=legend_texts, inset=0.05, pch=c(20,20), 
col=c('blue','purple') ) 
+
+multiTitle(color="black","Number of Tor relays doubles every ", 
+           color="purple","2.1",
+           color="black",'-',
+           color="blue","2.6",
+           color="black"," years" )
+
+
+
+####### Plot the best-fit lines ############################
+
+# remove data before year 2010
+FD <- D[ which(D$day >= 750), ]
+
+# remove points with super-high-residuals from the fitted line
+fit_all <- lm( log2(relays.all) ~ day, data=FD )
+rows_to_remove <- abs(resid(fit_all)) > 0.38
+FD <- FD[ !rows_to_remove, ]
+
+
+# fit to a linear model
+fit_all <- lm( log2(relays.all) ~ day, data=FD )
+fit_stable <- lm( log2(relays.stable) ~ day, data=FD )
+
+
+# Add the best-fit lines
+first_day <- min(FD$day)
+segments( first_day, predict(fit_all, data.frame(day=first_day)),
+          max(FD$day), predict(fit_all, data.frame(day=max(FD$day))),
+          col="black", lty=2, lwd=3 )
+segments( first_day, predict(fit_stable, data.frame(day=first_day)),
+          max(FD$day), predict(fit_stable, data.frame(day=max(FD$day))),
+          col="black", lty=2, lwd=3 )
+
+# Add the black squares
+points( first_day, predict(fit_all, data.frame(day=min(FD$day))), col="black", 
pch=15, cex=1.3)
+points( first_day, predict(fit_stable, data.frame(day=min(FD$day))), 
col="black", pch=15, cex=1.3)
+
+#summary( fit_all )
+#summary( fit_stable )
\ No newline at end of file
diff --git a/2014/tor-growth/scripts/torperf.R 
b/2014/tor-growth/scripts/torperf.R
new file mode 100644
index 0000000..eda9162
--- /dev/null
+++ b/2014/tor-growth/scripts/torperf.R
@@ -0,0 +1,136 @@
+rm( list = ls() )
+setwd('/Users/know/Desktop/tor analytics/')
+library(car)    # for pretty plots
+library(matlab) # for matlab function names
+library(data.table) # for data.table like data.frame
+#library(xtable) # for exporting to LaTeX
+#library(gdata)
+library(lmtest) # for testing linear models
+library(calibrate)
+
+library(plyr) # for renaming columns
+source("colortitles.R")
+#######################################################
+
+process_torperf_rawdata <- function( filename, filesize_to_consider=5242880 )
+{
+  
+  # Import data from TorPerf
+  Dtp <- read.csv( filename )[c('day','date','size','source','q1','md','q3')]
+  Dtp <- rename(Dtp, c("size"="filesize") )
+  
+  print(unique(Dtp$filesize))
+  # only use the aggregated Tor data for downloading a 5 MiB file
+  Dtp <- subset( Dtp, source=='' & filesize==filesize_to_consider )
+  
+  #print( tail(Dtp) )
+  
+  # drop the source and filesize column
+  #Dtp <- Dtp[ , -which(names(Dtp) %in% c('source','filesize'))]
+  Dtp <- Dtp[ , -which(names(Dtp) %in% c('source'))]
+  
+  # rename the q1, md, and q3 for TIME
+  Dtp <- rename(Dtp, c("q1"="time_q1","md"="time_md","q3"="time_q3") )
+  
+  # convert time from MILLISECONDS -> SECONDS
+  Dtp$time_q1 <- Dtp$time_q1/1000
+  Dtp$time_md <- Dtp$time_md/1000
+  Dtp$time_q3 <- Dtp$time_q3/1000
+  
+  
+  # now create the bw_q1, bw_md, bw_q3 in: KiB/s
+  Dtp[c("bw_q1","bw_md","bw_q3")] <- c(NA,NA,NA)
+  
+  
+  # Rewrite q1, md, and q3 to be in bandwidth (KiB/s)
+  Dtp$bw_q1 <- (filesize_to_consider / 1024) / Dtp$time_q1;
+  Dtp$bw_md <- (filesize_to_consider / 1024) / Dtp$time_md;
+  Dtp$bw_q3 <- (filesize_to_consider / 1024) / Dtp$time_q3;
+  
+  return(Dtp)
+}
+
+# get the two groups
+D_BIG <- process_torperf_rawdata('torperf-clean.csv', 5242880)
+D_MED <- process_torperf_rawdata('torperf-clean.csv', 1048576)
+D_SMALL <- process_torperf_rawdata('torperf-clean.csv', 51200)
+
+
+# DO ANALYSIS FROM HERE
+#######################################################
+
+plot(D_BIG$day, log2(D_BIG$bw_md), xlab='Year', ylab="Torperf bandwidth (KiB/s)", pch='', xaxt='n', yaxt='n', ylim=c(2.5,9) )
+#points(D_BIG$day, log2(D_BIG$bw_md), pch=20, col='blue', cex=0.4 )
+#points(D_MED$day, log2(D_MED$bw_md), pch=20, col='purple', cex=0.4 )
+#points(D_SMALL$day, log2(D_SMALL$bw_md), pch=20, col='red', cex=0.4 )
+
+
+points( smooth.spline(D_BIG$day, log2(D_BIG$bw_md)), pch=20, col='blue', 
cex=0.5 )
+points( smooth.spline(D_MED$day, log2(D_MED$bw_md)), pch=20, col='purple', 
cex=0.5 )
+points( smooth.spline(D_SMALL$day, log2(D_SMALL$bw_md)), pch=20, col='red', 
cex=0.5 )
+
+
+####### Set the pretty X-axis and Y-axis ############################
+YearLabels=seq(from=2010,to=2014,by=1)
+YearLocations=c(182,547,912,1278,1643)
+axis(1,at=YearLocations,labels=YearLabels )
+
+par(las=1)
+lab <- seq(from=-5,to=10,by=1)
+labels <- parse(text=paste("2^", lab, sep="") )
+axis(2,at=lab,labels=labels )
+
+
+
+
+####### Plot the best-fit lines ############################
+Dfit <- subset( D_BIG, day>=547 )
+first_fit_day <- min(Dfit$day)
+fit_BIG <- lm( log2(bw_md) ~ day, data=Dfit )
+
+
+# Add the best-fit line (with black square) for the 5 MiB Torperf fit
+segments( min(Dfit$day), predict(fit_BIG, data.frame(day=min(Dfit$day))),
+          max(Dfit$day), predict(fit_BIG, data.frame(day=max(Dfit$day))),
+          col="black", lty=2, lwd=3 )
+points( min(Dfit$day), predict(fit_BIG, data.frame(day=min(Dfit$day))), 
col="black", pch=15, cex=1.3)
+
+
+####################################################################
+
+Dfit <- subset( D_MED, day>=547 )
+first_fit_day <- min(Dfit$day)
+fit_MED <- lm( log2(bw_md) ~ day, data=Dfit )
+
+
+# Add the best-fit line (with black square) for the 1 MiB Torperf fit
+segments( min(Dfit$day), predict(fit_MED, data.frame(day=min(Dfit$day))),
+          max(Dfit$day), predict(fit_MED, data.frame(day=max(Dfit$day))),
+          col="black", lty=2, lwd=3 )
+points( min(Dfit$day), predict(fit_MED, data.frame(day=min(Dfit$day))), 
col="black", pch=15, cex=1.3)
+
+
+####################################################################
+
+Dfit <- subset( D_SMALL, day>=547 )
+fit_SMALL <- lm( log2(bw_md) ~ day, data=Dfit )
+
+# Add the best-fit line (with black square) for the 50 KiB Torperf fit
+segments( min(Dfit$day), predict(fit_SMALL, data.frame(day=min(Dfit$day))),
+          max(Dfit$day), predict(fit_SMALL, data.frame(day=max(Dfit$day))),
+          col="black", lty=2, lwd=3 )
+points( min(Dfit$day), predict(fit_SMALL, data.frame(day=min(Dfit$day))), 
col="black", pch=15, cex=1.3)
+
+
+
+legend_texts = c(
+  expression(paste("5   MiB ", r^2, "=0.46")),
+  expression(paste("1   MiB ", r^2, "=0.48")),
+  expression(paste("50 KiB ", r^2, "=0.55"))  
+)
+
+
+legend( "topleft", legend=legend_texts, inset=0.03, pch=c(20,20), 
col=c('blue','purple','red') ) 
+
+
diff --git a/2014/tor-growth/tor-growth.tex b/2014/tor-growth/tor-growth.tex
new file mode 100644
index 0000000..593d103
--- /dev/null
+++ b/2014/tor-growth/tor-growth.tex
@@ -0,0 +1,245 @@
+\documentclass{tortechrep}
+\usepackage{afterpage}
+%\usepackage{subfloat}
+\usepackage{subfig}
+\usepackage{url}
+\usepackage{fullpage}
+\usepackage{amsmath}
+\usepackage{booktabs}
+%\usepackage{afterpage}
+%\usepackage{subcanonption}
+\usepackage{graphicx}
+
+
+
+%%%%%%%%% BLUE UNDERLINES
+\usepackage{color}  % << color package is required for blue underline
+\usepackage{ulem} % << ulem package is required for blue underline
+
+%Define a blue underline
+\newcommand{\blueuline}{\bgroup\markoverwith{\hbox{\kern-.03em\vtop%
+{\begingroup\kern.1ex\color{blue}\hrule width .2em\kern1.1pt 
\endgroup\kern-.03em}}}\ULon}
+%\newcommand\reduline{\bgroup\markoverwith
+%      {\textcolor{red}{\rule[-0.5ex]{2pt}{0.4pt}}}\ULon}
+
+\newcommand{\uhref}[2]{\href{#1}{\blueuline{#2}}}
+%%%%%%%%%%%%% END BLUE UNDERLINES
+
+
+
+\title{Tor growth rates and improving Torperf throughput}
+\author{Virgil Griffith}
+\reportid{2014-10-001}
+\date{October 04, 2014}
+
+\newcommand{\Figref}[1]{Figure~\ref{#1}}
+\newcommand{\figref}[1]{Figure~\ref{#1}}
+
+\begin{document}
+\maketitle
+
+\section{Preliminaries}
+Despite data being available from
+\uhref{http://metrics.torproject.org}{metrics.torproject.org} for some time,
+there's been little statistical analysis of that data.  Let's fix that.  From
+the Metrics data, the most obvious thing to plot is the number of relays over
+time; see \figref{fig:fig1}.  Plotting in logscale (so that a straight line
+means exponential growth) reveals that the number of relays increases
+exponentially.  Good to know.  The ``stable relays'' are plotted in purple
+because they are fabulous.
+
+
+Next, in \figref{fig:fig2} we chart the total network bandwidth over time.
+Tor's total network bandwidth doubles every 13--14 months, which is darn
+impressive!  Moore's Law, doubling every 18 months, is downright torpid by
+comparison.
+
+ 
+Since 2010 the doubling rates for both relays and bandwidth have been
+remarkably consistent.  Although there are unaccounted-for sinusoidal trends,
+the fact remains that a simple fit of $\log_2(y) = m\,x + b$ accounts for
+\textasciitilde 90\% of the variance!  Additionally, the 99\% confidence
+intervals on the predicted data are barely visible without a magnifying
+glass.  Extrapolation from statistics is a dangerous game, but realistically
+we can't expect these growth rates to be more predictable.  With this
+statistical bedrock under our feet, let's go deeper.  In \figref{fig:fig3} we
+see how the mean relay bandwidth grows over time.  We see that the mean relay
+bandwidth doubles about every two years.  This is akin to
+\uhref{http://www.nngroup.com/articles/law-of-bandwidth/}{Nielsen's Law},
+which states that for high-end home users, bandwidth doubles every two years.
+Good job operators---those Tor-shirts are well earned!
+
+
+
+
+ 
+ 
+We see that the mean relay bandwidth increases per Nielsen's Law, but how
+does this impact client experience?  Fortunately, we have
+\uhref{https://metrics.torproject.org/performance.html}{Torperf data} to
+answer this.  Simple things first: in \figref{fig:fig4} we plot Torperf
+bandwidth over time.  Torperf's fitted line isn't nearly as good a fit as the
+number of relays or total bandwidth (Figures \ref{fig:fig1} and
+\ref{fig:fig2}), but it conveys enough of the trend to be useful.  We see
+that, depending on file size, Torperf throughput doubles every 25--35
+months.\footnote{It's not obvious that Torperf bandwidth increases
+exponentially, but given that bandwidth and CPU are the primary factors in
+Torperf and that each of these follows its respective exponential curve, it's
+reasonable to err on the side of an exponential fit over a linear one.
+Statistical modeling often leverages domain knowledge.}  Given such a wide
+spread in \figref{fig:fig4}, we will separately consider the Torperf
+bandwidth for downloading a 50 KiB and a 5 MiB file.  Let's go deeper.
+ 
+ 
+
+
+Absolute Torperf improvements are great to see, but the key measure is how
+Torperf throughput compares with clients' non-Tor throughput.  From
+\uhref{http://www.netindex.com/}{Ookla bandwidth data} we calculate the
+composite mean download rate for the three countries with the greatest number
+of Tor clients: the United States, Germany, and Russia (\figref{fig:nonTor}).
+With the composite non-Tor bandwidth in hand, we plot Torperf bandwidth
+normalized (divided) by the composite non-Tor bandwidth, arriving at
+\figref{fig:fig5}.
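+Explicitly,
+\begin{equation*}
+\text{normalized Torperf} = \frac{\text{Torperf bandwidth}}{\text{composite non-Tor bandwidth}} \times 100\%.
+\end{equation*}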
+ 
+
+
+For smaller files (50 KiB), we see that although absolute Torperf has been 
doubling every 35 months, normalized Torperf has been essentially flat.  For 
larger files (5 MiB), we see a gradual uptick in normalized Torperf.
+
+From the doubling rates of Torperf and composite non-Tor bandwidth we can
+derive the normalized Torperf growth rates analytically.  Taking the ratio of
+two exponentials of the form $y = 2^{x/n}$, where $n$ is the doubling time,
+we get $y = 2^{(1/n - 1/m) x}$, where $n$ and $m$ are the doubling times of
+Torperf bandwidth and composite non-Tor bandwidth respectively.  This results
+in normalized Torperf doubling every $20$ years for small files and every $5$
+years for large files.  To put a five-year doubling rate in perspective, this
+means Torperf will reach $5\%$ of non-Tor bandwidth around the year 2022.
+Internal optimizations like the
+\uhref{http://www.robgjansen.com/publications/kist-sec2014.pdf}{KIST scheduler}
+are great steps to improve this.
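+
+Making the arithmetic concrete (a sketch using the rounded rates from
+Table~\ref{tbl:summary}): the ratio of two exponentials doubles every
+$1/(1/n - 1/m)$ time units, since
+\begin{equation*}
+\frac{2^{x/n}}{2^{x/m}} = 2^{\left(\frac{1}{n} - \frac{1}{m}\right)x}.
+\end{equation*}
+For 5 MiB files, $n \approx 24$ months (Torperf) and $m \approx 42$ months
+(composite non-Tor) give $1/(1/24 - 1/42) = 56$ months, roughly the five
+years quoted above.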
+ 
+\section{Will adding advertised bandwidth improve Torperf?}
+ 
+There have been 
\uhref{https://blog.torproject.org/blog/tor-incentives-research-roundup-goldstar-par-braids-lira-tears-and-torcoin}{various
 proposals} for improving client speeds by adding operator incentives beyond 
the established 
\uhref{https://www.torproject.org/getinvolved/tshirt.html}{T-shirts} and 
\uhref{https://blog.torservers.net/20131213/torservers-awarded-250000-by-digital-defenders.html}{financial
 grants}.  Our final analysis is an attempt to predict whether adding more 
advertised relay bandwidth would reliably improve Torperf throughput.
+
+
+We've established that absolute Torperf improves on its own due to the
+increasing bandwidth of relays.  Our first step to blunt the influence of
+increasing relay bandwidth is to always look at the \emph{normalized} Torperf
+performance.  We explored several different predictors of normalized Torperf,
+and the most promising was the proportion of total read bandwidth to total
+advertised bandwidth, or the Network Utilization Ratio (NUR).  We plot
+normalized Torperf as a function of NUR in \figref{fig:fig6}.
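+In symbols,
+\begin{equation*}
+\mathrm{NUR} = \frac{\text{total bandwidth read}}{\text{total advertised bandwidth}},
+\end{equation*}
+computed per day from the Metrics bandwidth data.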
+
+
+
+
+We see that NUR doesn't predict much of the normalized bandwidth for small
+(50 KiB) files.  However, for large files (5 MiB), there's a fuzzy yet
+definite trend of ``lower NUR means higher normalized Torperf''.  But there's
+a risk: the lowest NUR data points (purple) are all from 2014, so NUR could
+be acting as a mere proxy for the gradual (yet slow, per \figref{fig:fig5})
+improvement of normalized Torperf over time.
+
+We control for this with a two-factor ANOVA, using \texttt{DATE} and
+\texttt{NUR} as the two factors and normalized Torperf as the dependent
+variable.  For the stats-literate, the full ANOVA tables are given in Table
+\ref{tbl:anovatables}, but the take-home message is that \texttt{NUR}
+provides substantial predictive power for normalized Torperf even after
+accounting for \texttt{DATE}.  Concretely, while the single-factor model
+using \texttt{DATE} has an $r^2$ of $0.02$ (50 KiB) and $0.14$ (5 MiB), the
+two-factor model using \texttt{DATE} and \texttt{NUR} yields an $r^2$ of
+$0.17$ and $0.44$---a $750\%$ and $208\%$ improvement respectively.  This
+allows us to tentatively conclude that a sudden uptick in advertised
+bandwidth would improve normalized Torperf beyond the glacial ascent seen in
+\figref{fig:fig5}.\footnote{Unsurprisingly, there are some caveats to this
+conclusion.  Our argument presumes that the distribution of advertised
+bandwidth across relays stays constant---for example, Torperf would not
+improve if $10^{12}$ new relays joined the consensus but each provided only
+$1$ B/s.  We're aware of no evidence indicating this assumption is
+unrealistic.}
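+
+In R, the model comparison takes roughly the following shape (a sketch
+mirroring our analysis scripts, in which \texttt{day} encodes the date and
+\texttt{congestion} is the NUR):
+\begin{verbatim}
+m_date <- lm( normalized_torperf ~ day, data=D )
+m_both <- lm( normalized_torperf ~ day + congestion, data=D )
+anova( m_both )    # sequential two-factor ANOVA table
+summary( m_both )  # r^2 for the two-factor model
+\end{verbatim}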
+
+
+\begin{table}[hbt]
+\centering
+\subfloat[50 KiB.  For aggregate model $r^2=0.17$.] {
+\begin{tabular}{ l l l l l l } \toprule
+ & df    & Sum Sq & Mean Sq & F-value & p-value \\
+\midrule
+\texttt{DATE}    & 1     & 0.04039      & 0.040390    & \ \ 38.418  & $7.309\times10^{-10}$ \\
+\texttt{NUR}     & 1     & 0.29005      & 0.290045    & 275.887  & $2\times10^{-16}$ \\
+Residuals & 1546     & 1.62534      & 0.001051    &   &  \\
+\bottomrule
+\end{tabular} }
+
+%\vskip
+\bigskip
+
+\subfloat[5 MiB.  For aggregate model $r^2=0.44$.]{
+\begin{tabular}{ l l l l l l } \toprule
+ & df    & Sum Sq & Mean Sq & F-value & p-value \\
+\midrule
+\texttt{DATE}    & 1     & 27.395      & 27.395    & 401.09  & $2\times10^{-16}$ \\
+\texttt{NUR}     & 1     & 56.750      & 56.750    & 830.87  & $2\times10^{-16}$ \\
+Residuals & 1546     & 105.595      & 0.068   &   &  \\
+\bottomrule
+\end{tabular} }
+
+\caption{ANOVA tables predicting normalized Torperf for downloading a 50 KiB 
and 5 MiB file.}
+\label{tbl:anovatables}
+\end{table}
+
+
+ 
+\section{Summary}
+We've learned a few things.
+
+\begin{enumerate} 
+    \item Many aspects of Tor follow exponential growth.  Table 
\ref{tbl:summary} summarizes these results.  Additionally, Tor bandwidth 
currently sits at $<2\%$ of mean non-Tor bandwidth.
+
+    \item Tor clients' absolute throughput is steadily improving.  However,
+after normalizing by mean non-Tor bandwidth, this improvement is greatly
+diminished.  For small files, normalized Torperf has been essentially flat
+for as long as records have been kept.
+
+    \item An intervention to increase advertised bandwidth would noticeably 
improve normalized Torperf for large \emph{as well as small} files.
+\end{enumerate}
+
+
+\begin{table}
+\centering
+\begin{tabular}{ l l l l } \toprule
+ & Doubling rate    & $\ \ r^2$ \\
+% & (years)    &  \\ 
+\midrule
+Total advertised bandwidth & \ \;1.2 \ years &  0.96 \\
+Mean relay bandwidth & \ \;2\ \ \; \ years &  0.91 \\
+Number of relays (all) & \ \;3\ \ \; \ years & 0.94 \medskip \\
+
+Absolute Torperf (5 MiB) & \ \;2\ \ \; \ years & 0.46  \\
+Absolute Torperf (50 KiB) & \ \;3\ \ \; \ years & 0.55 \medskip \\
+
+Mean RU download bandwidth & \ \;3.1 \ years & 0.95 \\
+Mean US download bandwidth & \ \;3.4 \ years & 0.97 \\
+Mean DE download bandwidth & \ \;3.9 \ years & 0.88 \\
+Composite download bandwidth & \ \;3.5 \ years & 0.97 \medskip \\
+
+Normalized Torperf (5 MiB) & \ \;5 \ \ \;\ years & \ \ - \\
+Normalized Torperf (50 KiB) & 19.9 \ years & \ \ - \\
+\bottomrule
+\end{tabular} 
+
+\caption{Summary of growth rates}
+\label{tbl:summary}
+\end{table}
+
+
+
+\section{Future Work}
+\label{sect:fw}
+Some natural extensions to this work are:
+\begin{itemize}
+    \item Instead of looking at the \emph{mean} relay bandwidth, separately
+calculate the \emph{expected} bandwidth for the guard, middle, and exit node
+positions.
+    \item It'd be nice to characterize the \emph{distribution} of advertised
+bandwidth.  Does it follow a Gaussian?  A Pareto?
+    \item When computing the composite non-Tor bandwidth, instead of doing an
+unweighted average of the United States, Germany, and Russia, it'd be better
+to do a \emph{weighted average} among all countries, in which each country is
+weighted by its number of originating Tor clients.  We doubt this would
+change the conclusions.
+    \item Tor's Network Utilization Ratio (NUR), shown in \figref{fig:fig8}, 
has clear drops of unclear cause.  Given how predictive NUR is of normalized 
Torperf, we'd like to know the causes of the two drops in NUR on 2013-10-09 and 
2014-06-06.
+\end{itemize}
+
+\flushleft \textbf{Acknowledgements.}  We thank Roger Dingledine and Karsten 
Loesing for their help and review.  All analyses were done in R.
+
+\clearpage
+
+%%%%%%%%%%%%%%%%%
+%% FIGURES
+%%%%%%%%%%%%%%%%%
+
+\begin{figure}[h!bt]
+    \centering
+    \includegraphics[height=3.55in]{figs/fig1-relays.png}
+    \caption{The number of Tor relays increases exponentially, doubling every 
2 years (stable) to 2.5 years (all).}
+    \label{fig:fig1}    
+\end{figure}
+\begin{figure}[h!bt]
+    \centering
+    \includegraphics[height=3.55in]{figs/fig2-bw.png}
+    \caption{Total network bandwidth also increases exponentially.}
+    \label{fig:fig2}    
+\end{figure}
+
+\begin{figure}[h!bt]
+    \centering
+    \includegraphics[height=5in]{figs/fig3-mean-bw.png}
+    \caption{Mean relay bandwidth increases exponentially and doubles 
approximately every 24 months.}
+    \label{fig:fig3}    
+\end{figure}
+
+\begin{figure}[h!bt]
+    \centering
+    \includegraphics[height=5in]{figs/fig4--torperf.png}
+    \caption{Absolute Torperf throughput increases exponentially, doubling 
every 25 months for 5 MiB files and every 35 months for 50 KiB files.  
Unfortunately, the throughput when downloading a 50 KiB file is \textasciitilde 
8x slower than downloading a 5 MiB file.  These trends imply that these two 
rates will continue to diverge.}
+    \label{fig:fig4}    
+\end{figure}
+
+
+\begin{figure}[h!bt]
+    \centering
+    \includegraphics[height=5in]{figs/5a--normalized-torperf-small.png}
+    \caption{The normalized Torperf for 50 KiB and 5 MiB files.}
+    \label{fig:fig5}     
+\end{figure}
+
+
+\begin{figure}
+    \centering
+    \subfloat[50 KiB; $r^2=0.15$.]{ 
\includegraphics[height=4in]{figs/fig6--NUR-predicts-normtorperf_small.png} 
\label{fig:6a} }
+    
+    \subfloat[5 MiB; $r^2=0.44$.]{ 
\includegraphics[height=4in]{figs/fig6--NUR-predicts-normalized-torperf_large.png}
 \label{fig:6b} }    
+    \caption{Low NUR implies higher normalized Torperf---especially so for
+larger files.}
+    \label{fig:fig6}
+\end{figure}
+
+
+
+\begin{figure}[h!bt]
+    \centering
+    \includegraphics[height=5in]{figs/non-tor-bw.png}
+    \caption{Mean download bandwidth for the United States, Germany, and
+Russia according to netindex.com.  Composite is the mean of all three.}
+    \label{fig:nonTor}    
+\end{figure}
+
+
+\begin{figure}[h!bt]
+    \centering
+    \includegraphics[height=5in]{figs/appendix--NUR-over-time.png}
+    \caption{Network Utilization Ratio (NUR) falls into three distinct stages. 
 Within each stage the fitted line is essentially flat.  What happened on 
2013-10-08 and 2014-06-06!?  The only thing we see is that on 2014-06-05 (one 
day prior) the EFF began their Tor Challenge.}
+    \label{fig:fig8}    
+\end{figure}
+
+
+
+
+
+
+
+
+
+
+
+\end{document}
diff --git a/2014/tor-growth/tortechrep.cls b/2014/tor-growth/tortechrep.cls
new file mode 120000
index 0000000..4c24db2
--- /dev/null
+++ b/2014/tor-growth/tortechrep.cls
@@ -0,0 +1 @@
+../../tortechrep.cls
\ No newline at end of file


