This is an automated email from the git hooks/post-receive script.

afif pushed a commit to branch master
in repository nanopolish.

commit 436609ff75aa5294791d7fd8433376eb27965f20
Author: Afif Elghraoui <a...@debian.org>
Date:   Sun Feb 4 16:05:33 2018 -0500

    New upstream version 0.8.5
---
 Makefile                                           |  26 +-
 README.md                                          |  11 +-
 docs/Makefile                                      |  20 +
 docs/source/_static/nanopolish-workflow.png        | Bin 0 -> 318184 bytes
 .../_static/quickstart_methylation_results.png     | Bin 0 -> 50765 bytes
 docs/source/conf.py                                | 168 +++++++++
 docs/source/debug.rst                              | 170 +++++++++
 docs/source/index.rst                              |  30 ++
 docs/source/installation.rst                       |  45 +++
 docs/source/manual.rst                             | 413 +++++++++++++++++++++
 docs/source/quickstart_call_methylation.rst        | 125 +++++++
 docs/source/quickstart_consensus.rst               | 115 ++++++
 docs/source/quickstart_eventalign.rst              | 103 +++++
 scripts/calculate_methylation_frequency.py         |  11 +-
 scripts/extract_reads_aligned_to_region.py         | 309 +++++++++++++++
 scripts/nanopolish_merge.py                        |  17 +-
 src/alignment/nanopolish_alignment_db.cpp          |   1 +
 src/alignment/nanopolish_eventalign.cpp            | 134 ++-----
 src/common/nanopolish_bam_processor.cpp            |   3 +-
 src/common/nanopolish_common.h                     |   2 +-
 src/main/nanopolish.cpp                            |   9 +-
 src/nanopolish_call_variants.cpp                   |   6 +-
 src/nanopolish_extract.cpp                         |  70 ++--
 src/nanopolish_index.cpp                           |  18 +-
 src/nanopolish_raw_loader.cpp                      |  12 +-
 src/nanopolish_read_db.cpp                         |  19 +-
 src/nanopolish_squiggle_read.cpp                   |  46 ++-
 27 files changed, 1703 insertions(+), 180 deletions(-)

diff --git a/Makefile b/Makefile
index 5efffe6..3cab7a6 100644
--- a/Makefile
+++ b/Makefile
@@ -10,14 +10,15 @@ SUBDIRS := src src/hmm src/thirdparty 
src/thirdparty/scrappie src/common src/ali
 #Basic flags every build needs
 LIBS=-lz
 CXXFLAGS ?= -g -O3
-CXXFLAGS += -std=c++11 -fopenmp
+CXXFLAGS += -std=c++11 -fopenmp -fsigned-char
 CFLAGS ?= -O3 -std=c99
 CXX ?= g++
 CC ?= gcc
 
-# Change the value of HDF5 or EIGEN below to any value to disable compilation 
of bundled HDF5 code
-HDF5=install
-EIGEN=install
+# Change the value of HDF5, EIGEN, or HTS below to any value to disable 
compilation of bundled code
+HDF5?=install
+EIGEN?=install
+HTS?=install
 
 # Check operating system, OSX doesn't have -lrt
 UNAME_S := $(shell uname -s)
@@ -45,9 +46,14 @@ else
     EIGEN_CHECK=
 endif
 
-# Build and link the libhts submodule
-HTS_LIB=./htslib/libhts.a
-HTS_INCLUDE=-I./htslib
+# Default to build and link the libhts submodule
+ifeq ($(HTS), install)
+    HTS_LIB=./htslib/libhts.a
+    HTS_INCLUDE=-I./htslib
+else
+    # Use system-wide htslib
+    HTS_LIB=-lhts
+endif
 
 # Include the header-only fast5 library
 FAST5_INCLUDE=-I./fast5/include
@@ -116,15 +122,15 @@ include .depend
        $(CXX) -o $@ -c $(CXXFLAGS) $(CPPFLAGS) -fPIC $<
 
 .c.o:
-       $(CC) -o $@ -c $(CFLAGS) $(H5_INCLUDE) -fPIC $<
+       $(CC) -o $@ -c $(CFLAGS) $(CPPFLAGS) $(H5_INCLUDE) -fPIC $<
 
 # Link main executable
 $(PROGRAM): src/main/nanopolish.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) 
$(EIGEN_CHECK)
-       $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) 
$(HTS_LIB) $(H5_LIB) $(LIBS)
+       $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) 
$(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS)
 
 # Link test executable
 $(TEST_PROGRAM): src/test/nanopolish_test.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) 
$(H5_LIB)
-       $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) 
$(HTS_LIB) $(H5_LIB) $(LIBS)
+       $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) 
$(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS)
 
 test: $(TEST_PROGRAM)
        ./$(TEST_PROGRAM)
diff --git a/README.md b/README.md
index 0564f32..b3433c4 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,16 @@ Software package for signal-level analysis of Oxford Nanopore 
sequencing data. N
 
 ## Dependencies
 
-[libhdf5](http://www.hdfgroup.org/HDF5/release/obtain5.html) is automatically 
downloaded and compiled when running `make` but this can be disabled with: 
`HDF5=nofetch make`. The nanopolish binary will link libhdf5.a statically.
+A compiler that supports C++11 is needed to build nanopolish. Development of 
the code is performed using [gcc-4.8](https://gcc.gnu.org/gcc-4.8/).
 
-[eigen](http://eigen.tuxfamily.org) is also automatically downloaded and 
included when compiling with `make`.
+By default, nanopolish will download and compile all of its required 
dependencies. Some users however may want to use system-wide versions of the 
libraries. To turn off the automatic installation of dependencies set 
`HDF5=noinstall`, `EIGEN=noinstall` or `HTS=noinstall` parameters when running 
`make` as appropriate. The current versions and compile options for the 
dependencies are:
 
-[biopython](http://www.biopython.org) is required to run the helpers in 
`scripts/`.
+* [libhdf5-1.8.14](http://www.hdfgroup.org/HDF5/release/obtain5.html) compiled 
with multi-threading support `--enable-threadsafe`
+* [eigen-3.2.5](http://eigen.tuxfamily.org)
+* [htslib-1.4](http://github.com/samtools/htslib) 
 
-[htslib](http://github.com/samtools/htslib) is included as a submodule and 
compiled automatically.
+Additionally the helper `scripts` require 
[biopython](http://www.biopython.org) and 
[pysam](http://pysam.readthedocs.io/en/latest/installation.html).
 
-A compiler that supports C++11 is needed to build the sources. Development of 
the code is performed using [gcc-4.8](https://gcc.gnu.org/gcc-4.8/).
 
 ## Installation instructions
 
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..4f4c5de
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SPHINXPROJ    = Nanopolish
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+       @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+       @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/docs/source/_static/nanopolish-workflow.png 
b/docs/source/_static/nanopolish-workflow.png
new file mode 100644
index 0000000..34c8a6d
Binary files /dev/null and b/docs/source/_static/nanopolish-workflow.png differ
diff --git a/docs/source/_static/quickstart_methylation_results.png 
b/docs/source/_static/quickstart_methylation_results.png
new file mode 100644
index 0000000..05dc733
Binary files /dev/null and 
b/docs/source/_static/quickstart_methylation_results.png differ
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 0000000..d4d41d5
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,168 @@
+# -*- coding: utf-8 -*-
+#
+# Nanopolish documentation build configuration file, created by
+# sphinx-quickstart on Tue Nov 14 14:59:25 2017.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+import sphinx_rtd_theme
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['sphinx.ext.autodoc']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'Nanopolish'
+copyright = u'2017, Simpson Lab'
+author = u'Simpson Lab'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = u'0.8.4'
+# The full version, including alpha/beta/rc tags.
+release = u'0.8.4'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This patterns also effect to html_static_path and html_extra_path
+exclude_patterns = []
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = "sphinx_rtd_theme"
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+#html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# This is required for the alabaster theme
+# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
+html_sidebars = {
+    '**': [
+        'relations.html',  # needs 'show_related': True theme option to display
+        'searchbox.html',
+    ]
+}
+
+
+# -- Options for HTMLHelp output ------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'Nanopolishdoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'Nanopolish.tex', u'Nanopolish Documentation',
+     u'Simpson Lab', 'manual'),
+]
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'nanopolish', u'Nanopolish Documentation',
+     [author], 1)
+]
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'Nanopolish', u'Nanopolish Documentation',
+     author, 'Nanopolish', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+
+
diff --git a/docs/source/debug.rst b/docs/source/debug.rst
new file mode 100644
index 0000000..005f763
--- /dev/null
+++ b/docs/source/debug.rst
@@ -0,0 +1,170 @@
+.. _help_us_debug:
+
+Helping us debug nanopolish
+===============================
+
+Overview
+"""""""""""""""""""""""
+
+Running into errors with nanopolish? To help us debug, we need to be able to 
reproduce the errors. We can do this by packaging a subset of the files that 
were used by a nanopolish. We have provided 
``scripts/extract_reads_aligned_to_region.py`` and this tutorial to help you do 
exactly this.
+
+Briefly, this script will:
+
+* extract reads that align to a given region in the draft genome assembly
+* rewrite a new BAM, BAI, FASTA file with these reads
+* extract the FAST5 files associated with these reads
+* save all these files into a tar.gz file
+
+Workflow
+"""""""""""""
+
+#. Narrow down a problematic region by running ``nanopolish variants 
--consensus [...]`` and changing the ``-w`` parameter.
+#. Run the ``scripts/extract_reads_aligned_to_region.py``.
+#. Send the resulting ``tar.gz`` file to us by hosting either a dropbox or 
google drive.
+
+.. _creating_example_dataset:
+
+Tutorial - using extraction helper script to create example datasets
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+We extracted a subset of reads for a 2kb region to create the example dataset 
for the eventalign and consensus tutorial using 
``scripts/extract_reads_aligned_to_region.py``. Here is how:
+
+|
+
+Generated basecalled ``--reads`` file:
+
+#. Basecalled reads with albacore: ::
+
+    read_fast5_basecaller.py -c r94_450bps_linear.cfg -t 8 -i 
/path/to/raw/fast5/files -s /path/to/albacore-2.0.1/output/directory -o fastq 
+
+#. Merged the different albacore fastq outputs: ::
+
+    cat /path/to/albacore-2.0.1/output/directory/workspace/pass/*.fastq > 
albacore-2.0.1-merged.fastq
+
+#. Converted the merged fastq to fasta format: ::
+
+    paste - - - - < albacore-2.0.1-merged.fastq | cut -f 1,2 | sed 's/^@/>/' | 
tr "\t" "\n" > reads.fasta
+
+|
+
+Generated ``--bam`` file with the draft genome assembly (``-g``):
+
+#. Ran canu to create draft genome assembly: ::
+
+    canu \
+        -p ecoli -d outdir genomeSize=4.6m \
+        -nanopore-raw reads.fasta \ 
+
+#. Index draft assembly: ::
+
+    bwa index ecoli.contigs.fasta
+    samtools faidx ecoli.contigs.fasta
+
+#. Aligned reads to draft genome assembly with bwa (v0.7.12): ::
+
+    bwa mem -x ont2d ecoli.contigs.fasta reads.fasta | samtools sort -o 
reads.sorted.bam -T reads.tmp
+    samtools index reads.sorted.bam
+
+|
+
+Selected a ``--window``:
+
+#. Identified the first contig name and chose a random start position: ::
+
+    head -3 ecoli.contigs.fasta
+
+Output: ::
+
+    >tig00000001 len=4376233 reads=23096 covStat=7751.73 gappedBases=no 
class=contig suggestRepeat=no suggestCircular=no
+    
AGATGCTTTGAAAGAAACGCAGAATAGATCTCTATGTAATGATATGGAATACTCTGGTATTGTCTGTAAAGATACTAATGGAAAATATTTTGCATCTAAG
+    
GCAGAAACTGATAATTTAAGAAAGGAGTCATATCCTCTGAAAAGAAAATGTCCCACAGGTACAGATAGAGTTGCTGCTTATCATACTCACGGTGCAGATA
+ 
+As we wanted a 2kb region, we selected a random start position (200000) so our 
end position was 202000. Therefore our ``--window`` was 
"tig00000001:200000-202000".
+
+|
+
+Using the files we created, we ran 
``scripts/extract_reads_aligned_to_region.py``, please see usage example below.
+
+.. note:: Make sure nanopolish still reproduces the same error on this subset 
before sending it to us.
+
+Usage example
+"""""""""""""""""""""""
+::
+
+    python extract_reads_aligned_to_region.py \
+        --reads reads.fasta \
+        --genome ecoli.contigs.fasta \
+        --bam reads.sorted.bam \
+        --window "tig00000001:200000-202000" \
+        --output_prefix ecoli_2kb_region --verbose
+
+.. list-table:: 
+   :widths: 25 5 20 50
+   :header-rows: 1
+
+   * - Argument name(s)
+     - Req.
+     - Default value
+     - Description
+
+   * - ``-b``, ``--bam``
+     - Y
+     - NA
+     - Sorted bam file created by aligning reads to the draft genome.
+
+   * - ``-g``, ``--genome``
+     - Y
+     - NA
+     - Draft genome assembly
+
+   * - ``-r``, ``--reads``
+     - Y
+     - NA
+     - Fasta, fastq, fasta.gz, or fastq.gz file containing basecalled reads.
+
+   * - ``-w``, ``--window``
+     - Y
+     - NA
+     - Draft genome assembly coordinate string ex. "contig:start-end". It is 
essential that you wrap the coordinates in quotation marks (\").
+
+   * - ``-o``, ``--output_prefix``
+     - N
+     - reads_subset
+     - Prefix of output tar.gz and log file.
+
+   * - ``-v``, ``--verbose``
+     - N
+     - False
+     - Use for verbose output with info on progress.
+
+Script overview
+"""""""""""""""""""""
+
+#. Parse input files
+#. Assumes readdb file name from input reads file
+#. Validates input
+    - checks that input bam, readdb, fasta/q, draft genome assembly, draft 
genome assembly index file exist, are not empty, and are readable
+#. With user input draft genome assembly coordinates, extracts all reads that 
aligned within these coordinates stores the read_ids. This information can be 
found from the input BAM.
+    - uses pysam.AlignmentFile
+    - uses samfile.fetch(region=draft_ga_coords) to get all reads aligned to 
region
+    - if reads map to multiple sections within draft ga it is not added again
+#. Parses through the input readdb file to find the FAST5 files associated 
with each region that aligned to region
+    - stores in dictionary region_fast5_files; key = read_id, value = 
path/to/fast5/file
+    - path to fast5 file is currently dependent on the user's directory 
structure
+#. Make a BAM and BAI file for this specific region
+    - creates a new BAM file called ``region.bam``
+    - with pysam.view we rewrite the new bam with reads that aligned to the 
region...
+    - with pysam.index we create a new BAI file
+#. Now to make a new FASTA file with this subset of reads
+    - the new fasta file is called ``region.fasta``
+    - this first checks what type of sequences file is given { ``fasta``, 
``fastq``, ``fasta.gz``, ``fastq.gz`` }
+    - then handles based on type of seq file using SeqIO.parse
+    - then writes to a new fasta file
+#. Let's get to tarring
+    - creates a ``tar.gz`` file with the output prefix
+    - saves the fast5 files in directory ``output_prefix/fast5_files/``
+#. Adds the new fasta, new bam, and new bai file with the subset of reads
+#. Adds the draft genome assembly and associated fai index file
+#. Performs a check
+    - the number of reads in the new BAM file, new FASTA file, and the number 
of files in the fast5_files directory should be equal
+#. Outputs a ``tar.gz`` and ``log`` file. FIN!
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 0000000..1bce713
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,30 @@
+.. Nanopolish documentation master file, created by
+   sphinx-quickstart on Tue Nov 14 14:59:25 2017.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+nanopolish
+======================================
+`nanopolish <https://github.com/jts/nanopolish>`_ is a software package for 
signal-level analysis of Oxford Nanopore sequencing data. Nanopolish can 
calculate an improved consensus sequence for a draft genome assembly, detect 
base modifications, call SNPs and indels with respect to a reference genome and 
more (see Nanopolish modules, below).
+
+
+.. toctree::
+   :hidden:
+
+   installation
+   quickstart_consensus
+   quickstart_eventalign
+   quickstart_call_methylation
+   debug
+   manual
+
+Publications
+====================================
+
+* Loman, Nicholas J., Joshua Quick, and Jared T. Simpson. "A complete 
bacterial genome assembled de novo using only nanopore sequencing data." Nature 
methods 12.8 (2015): 733-735.
+* Quick, Joshua, et al. "Real-time, portable genome sequencing for Ebola 
surveillance." Nature 530.7589 (2016): 228-232.
+* Simpson, Jared T., et al. "Detecting DNA cytosine methylation using nanopore 
sequencing." nature methods 14.4 (2017): 407-410.
+
+Credits and Thanks
+========================
+The fast table-driven logsum implementation was provided by Sean Eddy as 
public domain code. This code was originally part of `hmmer3 
<http://hmmer.org/>`_ . Nanopolish also includes code from Oxford Nanopore's 
`scrappie <https://github.com/nanoporetech/scrappie>`_ basecaller. This code is 
licensed under the MPL.
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
new file mode 100644
index 0000000..36a1de9
--- /dev/null
+++ b/docs/source/installation.rst
@@ -0,0 +1,45 @@
+.. _installation:
+
+Installation
+=======================
+
+Dependencies
+-----------------------
+
+A compiler that supports C++11 is needed to build nanopolish. Development of 
the code is performed using `gcc-4.8 <https://gcc.gnu.org/gcc-4.8/>`_.
+
+By default, nanopolish will download and compile all of its required 
dependencies. Some users however may want to use system-wide versions of the 
libraries. To turn off the automatic installation of dependencies set 
`HDF5=noinstall`, `EIGEN=noinstall` or `HTS=noinstall` parameters when running 
`make` as appropriate. The current versions and compile options for the 
dependencies are:
+
+* `libhdf5-1.8.14 <http://www.hdfgroup.org/HDF5/release/obtain5.html>`_ 
compiled with multi-threading support ``--enable-threadsafe``
+* `eigen-3.2.5 <http://eigen.tuxfamily.org/>`_ 
+* `htslib-1.4 <http://github.com/samtools/htslib>`_
+
+Additionally the helper `scripts` require `biopython <http://biopython.org/>`_ 
and `pysam <http://pysam.readthedocs.io/en/latest/installation.html>`_.
+
+Installing the latest code from github (recommended)
+------------------------------------------------------
+You can download and compile the latest code from github as follows ::
+
+    git clone --recursive https://github.com/jts/nanopolish.git
+    cd nanopolish
+    make
+
+Installing a particular release
+------------------------------------------------------
+When major features have been added or bugs fixed, we will tag and release a 
new version of nanopolish. If you wish to use a particular version, you can 
checkout the tagged version before compiling ::
+
+    git clone --recursive https://github.com/jts/nanopolish.git
+    cd nanopolish
+    git checkout v0.7.1
+    make
+
+To run using docker
+-------------------
+
+First build the image from the dockerfile: ::
+
+    docker build .
+
+Note the uuid given upon successful build. Then you can run nanopolish from 
the image: ::
+
+    docker run -v /path/to/local/data/data/:/data/ -it :image_id  ./nanopolish 
eventalign -r /data/reads.fa -b /data/alignments.sorted.bam -g /data/ref.fa
diff --git a/docs/source/manual.rst b/docs/source/manual.rst
new file mode 100644
index 0000000..844fae7
--- /dev/null
+++ b/docs/source/manual.rst
@@ -0,0 +1,413 @@
+.. _manual:
+
+Manual
+===================
+
+Modules available: ::
+
+    nanopolish extract: extract reads in FASTA or FASTQ format from a 
directory of FAST5 files
+    nanopolish call-methylation: predict genomic bases that may be methylated
+    nanopolish variants: detect SNPs and indels with respect to a reference 
genome
+    nanopolish variants --consensus: calculate an improved consensus sequence 
for a draft genome assembly
+    nanopolish eventalign: align signal-level events to k-mers of a reference 
genome
+
+|
+
+extract
+--------------------
+
+Overview
+"""""""""""""""""""""""
+
+This module is used to extract reads in FASTA or FASTQ format from a directory 
of FAST5 files.  
+
+Input
+"""""""""""""""""""""""
+
+    * path to a directory of FAST5 files modified to contain basecall 
information
+
+Output
+"""""""""""""""""""""""
+
+    * sequences of reads in FASTA or FASTQ format
+
+Usage example
+"""""""""""""""""""""""
+
+::
+
+   nanopolish extract [OPTIONS] <fast5|dir>
+
+.. list-table:: 
+   :widths: 20 10 20 50
+   :header-rows: 1
+
+   * - Argument name(s)
+     - Required
+     - Default value
+     - Description
+
+   * -  <fast5|dir>
+     - Y
+     - NA
+     - FAST5 or path to directory of FAST5 files.
+
+   * - ``-r``, ``--recurse``
+     - N
+     - NA
+     - Recurse into subdirectories
+
+   * - ``-q``, ``--fastq``
+     - N
+     - fasta format
+     - Use when you want to extract to FASTQ format
+
+   * - ``-t``, ``--type=TYPE``
+     - N
+     - 2d-or-template
+     - The type of read either: {template, complement, 2d, 2d-or-template, any}
+
+   * - ``-b``, ``--basecaller=NAME[:VERSION]``
+     - N
+     - NA
+     - consider only data produced by basecaller NAME, optionally with given 
exact VERSION
+
+   * - ``-o``, ``--output=FILE``
+     - N
+     - stdout
+     - Write output to FILE
+
+|
+
+index
+--------------------
+
+Overview
+"""""""""""""""""""""""
+Build an index mapping from basecalled reads to the signals measured by the 
sequencer
+
+Input
+""""""""
+    * path to directory of raw nanopore sequencing data in FAST5 format
+    * basecalled reads
+
+Output
+""""""""
+    * gzipped FASTA format of basecalled reads
+    * index files (fai, gzi, readdb)
+
+Readdb file format
+""""""""""""""""""""
+Readdb file is a tab-separated file that contains two columns. One column 
represents read ids and the other column represents the corresponding path to 
FAST5 file: ::
+
+    read_id_1   /path/to/fast5/containing/reads_id_1/signals
+    read_id_2   /path/to/fast5/containing/read_id_2/signals
+
+Usage example
+""""""""""""""
+::
+
+    nanopolish index [OPTIONS] -d nanopore_raw_file_directory reads.fastq
+
+.. list-table::
+   :widths: 20 10 20 50
+   :header-rows: 1
+
+   * - Argument name(s)
+     - Required
+     - Default value
+     - Description
+
+   * - ``-d``, ``--directory``
+     - Y
+     - NA
+     - FAST5 or path to directory of FAST5 files containing ONT sequencing raw 
signal information.
+
+   * - ``-f``, ``--fast5-fofn``
+     - N
+     - NA
+     - file containing the paths to each fast5 for the run
+
+
+
+call-methylation
+--------------------
+
+Overview
+"""""""""""""""""""""""
+
+Classify nucleotides as methylated or not.
+
+Input
+"""""""""""""""""""""""
+
+    * Basecalled ONT reads in FASTA format
+
+Output
+"""""""""""""""""""""""
+
+    * tab-separated file containing per-read log-likelihood ratios
+
+Usage example
+"""""""""""""""""""""""
+
+::
+
+   nanopolish call-methylation [OPTIONS] <fast5|dir>
+
+.. list-table::
+   :widths: 20 10 20 50
+   :header-rows: 1
+
+   * - Argument name(s)
+     - Required
+     - Default value
+     - Description
+
+   * - ``-r``, ``--reads=FILE``
+     - Y
+     - NA
+     - the 2D ONT reads are in fasta FILE
+
+   * - ``-b``, ``--bam=FILE``
+     - Y
+     - NA 
+     - the reads aligned to the genome assembly are in bam FILE
+
+   * - ``-g``, ``--genome=FILE``
+     - Y
+     - NA 
+     - the genome we are computing a consensus for is in FILE
+
+   * - ``-t``, ``--threads=NUM``
+     - N
+     - 1
+     - use NUM threads
+
+   * - ``--progress``
+     - N
+     - NA
+     - print out a progress message
+
+variants
+--------------------
+
+Overview
+"""""""""""""""""""""""
+
+This module is used to call single nucleotide polymorphisms (SNPs) using a 
signal-level HMM.  
+
+Input
+"""""""""""""""""""""""
+
+    * basecalled reads
+    * alignment info
+    * genome assembly
+
+Output
+"""""""""""""""""""
+
+    * VCF file
+
+Usage example
+"""""""""""""""""""""""
+
+::
+
+   nanopolish variants [OPTIONS] --reads reads.fa --bam alignments.bam 
--genome genome.fa
+
+.. list-table::
+   :widths: 20 10 20 50
+   :header-rows: 1
+
+   * - Argument name(s)
+     - Required
+     - Default value
+     - Description
+
+   * - ``--snps``
+     - N
+     - NA
+     - use flag to only call SNPs
+
+   * - ``--consensus=FILE``
+     - N
+     - NA
+     - run in consensus calling mode and write polished sequence to FILE
+
+   * - ``--fix-homopolymers``
+     - N
+     - NA
+     - use flag to run the experimental homopolymer caller
+
+   * - ``--faster``
+     - N
+     - NA
+     - minimize compute time while slightly reducing consensus accuracy
+
+   * - ``-w``, ``--window=STR``
+     - N
+     - NA
+     - find variants in window STR (format: <chromosome_name>:<start>-<end>)
+
+   * - ``-r``, ``--reads=FILE``
+     - Y
+     - NA
+     - the 2D ONT reads are in fasta FILE
+
+   * - ``-b``, ``--bam=FILE``
+     - Y
+     - NA
+     - the reads aligned to the reference genome are in bam FILE 
+
+   * - ``-e``, ``--event-bam=FILE``
+     - Y
+     - NA
+     - the events aligned to the reference genome are in bam FILE
+
+   * - ``-g``, ``--genome=FILE``
+     - Y
+     - NA
+     - the reference genome is in FILE
+
+   * - ``-o``, ``--outfile=FILE``
+     - N
+     - stdout
+     - write result to FILE
+
+   * - ``-t``, ``--threads=NUM``
+     - N
+     - 1
+     - use NUM threads
+
+   * - ``-m``, ``--min-candidate-frequency=F``
+     - N
+     - 0.2
+     - extract candidate variants from the aligned reads when the variant 
frequency is at least F
+
+   * - ``-d``, ``--min-candidate-depth=D``
+     - N
+     - 20
+     - extract candidate variants from the aligned reads when the depth is at 
least D
+
+   * - ``-x``, ``--max-haplotypes=N``
+     - N
+     - 1000
+     - consider at most N haplotypes combinations
+
+   * - ``--max-rounds=N``
+     - N
+     - 50
+     - perform N rounds of consensus sequence improvement
+
+   * - ``-c``, ``--candidates=VCF``
+     - N
+     - NA
+     - read variants candidates from VCF, rather than discovering them from 
aligned reads
+
+   * - ``-a``, ``--alternative-basecalls-bam=FILE``
+     - N
+     - NA
+     - if an alternative basecaller was used that does not output event 
annotations then use basecalled sequences from FILE. The signal-level events 
will still be taken from the -b bam
+
+   * - ``--calculate-all-support``
+     - N
+     - NA
+     - when making a call, also calculate the support of the 3 other possible 
bases
+
+   * - ``--models-fofn=FILE``
+     - N
+     - NA
+     - read alternatives k-mer models from FILE
+
+
+event align
+--------------------
+
+Overview
+"""""""""""""""""""""""
+
+Align nanopore events to reference k-mers
+
+Input
+"""""""""""""""""""""""
+
+    * basecalled reads
+    * alignment information
+    * assembled genome
+
+Usage example
+"""""""""""""""""""""""
+
+::
+
+   nanopolish eventalign [OPTIONS] --reads reads.fa --bam alignments.bam 
--genome genome.fa
+
+.. list-table::
+   :widths: 20 10 20 50
+   :header-rows: 1
+
+   * - Argument name(s)
+     - Required
+     - Default value
+     - Description
+
+   * - ``--sam``
+     - N
+     - NA
+     - use to write output in SAM format
+
+   * - ``-w, --window=STR``
+     - N
+     - NA
+     - Compute the consensus for window STR (format : ctg:start_id-end_id)
+
+   * - ``-r, --reads=FILE``
+     - Y
+     - NA
+     - the 2D ONT reads are in fasta FILE
+
+   * - ``-b, --bam=FILE``
+     - Y
+     - NA
+     - the reads aligned to the genome assembly are in bam FILE
+
+   * - ``-g, --genome=FILE``
+     - Y
+     - NA
+     - the genome we are computing a consensus for is in FILE
+
+   * - ``-t, --threads=NUM``
+     - N
+     - 1
+     - use NUM threads
+
+   * - ``--scale-events``
+     - N
+     - NA
+     - scale events to the model, rather than vice-versa
+
+   * - ``--progress``
+     - N
+     - NA
+     - print out a progress message
+
+   * - ``-n``, ``--print-read-names``
+     - N
+     - NA
+     - print read names instead of indexes
+
+   * - ``--summary=FILE``
+     - N
+     - NA
+     - summarize the alignment of each read/strand in FILE
+
+   * - ``--samples``
+     - N
+     - NA
+     - write the raw samples for the event to the tsv output
+
+   * - ``--models-fofn=FILE``
+     - N
+     - NA
+     - read alternative k-mer models from FILE
diff --git a/docs/source/quickstart_call_methylation.rst 
b/docs/source/quickstart_call_methylation.rst
new file mode 100644
index 0000000..40316e7
--- /dev/null
+++ b/docs/source/quickstart_call_methylation.rst
@@ -0,0 +1,125 @@
+.. _quickstart_call_methylation:
+
+Quickstart - calling methylation with nanopolish
+=====================================================
+
+Oxford Nanopore sequencers are sensitive to base modifications. Here we 
provide a step-by-step tutorial to help you get started with detecting base 
modifications using nanopolish.
+
+**For more information about our approach:**
+
+* Simpson, Jared T., et al. `"Detecting DNA cytosine methylation using 
nanopore sequencing." <https://www.nature.com/articles/nmeth.4184>`_ Nature 
Methods (2017). 
+
+**Requirements**:
+
+* `nanopolish v0.8.4 <installation.html>`_
+* `samtools v1.2 <http://samtools.sourceforge.net/>`_
+* `minimap2 <https://github.com/lh3/minimap2>`_
+
+Download example dataset
+------------------------------------
+
+In this tutorial we will use a subset of the `NA12878 WGS Consortium data 
<https://github.com/nanopore-wgs-consortium/NA12878/blob/master/Genome.md>`_. 
You can download the example dataset we will use here (warning: the file is 
about 2GB): ::
+
+    wget http://s3.climb.ac.uk/nanopolish_tutorial/methylation_example.tar.gz
+    tar -xvf methylation_example.tar.gz
+    cd methylation_example
+
+**Details**:
+
+* Sample :     Human cell line (NA12878)
+* Basecaller : Albacore v2.0.2
+* Region: chr20:5,000,000-10,000,000
+
+In the extracted example data you should find the following files:
+
+* ``albacore_output.fastq`` : the subset of the basecalled reads
+* ``reference.fasta`` : the chromosome 20 reference sequence
+* ``fast5_files/`` : a directory containing signal-level FAST5 files
+
+The reads were basecalled using this albacore command: ::
+
+    read_fast5_basecaller.py -c r94_450bps_linear.cfg -t 8 -i fast5_files -s 
basecalled/ -o fastq
+
+After the basecaller finished, we merged all of the fastq files together into 
a single file: ::
+
+    cat basecalled/workspace/pass/*.fastq > albacore_output.fastq
+
+Data preprocessing
+------------------------------------
+
+nanopolish needs access to the signal-level data measured by the nanopore 
sequencer. To begin, we need to create an index file that links read ids with 
their signal-level data in the FAST5 files: ::
+
+    nanopolish index -d fast5_files/ albacore_output.fastq
+
+We get the following files: ``albacore_output.fastq.index``, 
``albacore_output.fastq.index.fai``, ``albacore_output.fastq.index.gzi``, and 
``albacore_output.fastq.index.readdb``.
+
+Aligning reads to the reference genome
+--------------------------------------
+
+Next, we need to align the basecalled reads to the reference genome. We use 
minimap2 as it is fast enough to map reads to the human genome. In this example 
we'll pipe the output directly into ``samtools sort`` to get a sorted bam file: 
::
+
+    minimap2 -a -x map-ont reference.fasta albacore_output.fastq | samtools 
sort -T tmp -o albacore_output.sorted.bam
+    samtools index albacore_output.sorted.bam
+
+Calling methylation
+-------------------
+
+Now we're ready to use nanopolish to detect methylated bases (in this case 
5-methylcytosine in a CpG context). The command is fairly straightforward - we 
have to tell it what reads to use (``albacore_output.fastq``), where the 
alignments are (``albacore_output.sorted.bam``), the reference genome 
(``reference.fasta``) and what region of the genome we're interested in 
(``chr20:5,000,000-10,000,000``)::
+       
+    nanopolish call-methylation -t 8 -r albacore_output.fastq -b 
albacore_output.sorted.bam -g reference.fasta -w "chr20:5,000,000-10,000,000" > 
methylation_calls.tsv
+
+The output file contains a lot of information including the position of the CG 
dinucleotide on the reference genome, the ID of the read that was used to make 
the call, and the log-likelihood ratio calculated by our model: ::
+
+       chromosome  start    end      read_name                             
log_lik_ratio  log_lik_methylated  log_lik_unmethylated  num_calling_strands  
num_cpgs  sequence
+       chr20       4980553  4980553  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  
3.70           -167.47             -171.17               1                    1 
        TGAGACGGGGT
+       chr20       4980599  4980599  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  
2.64           -98.87              -101.51               1                    1 
        AATCTCGGCTC
+       chr20       4980616  4980616  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  
-0.61          -95.35              -94.75                1                    1 
        ACCTCCGCCTC
+       chr20       4980690  4980690  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  
-2.99          -99.58              -96.59                1                    1 
        ACACCCGGCTA
+       chr20       4980780  4980780  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  
5.27           -135.45             -140.72               1                    1 
        CACCTCGGCCT
+       chr20       4980807  4980807  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  
-2.95          -89.20              -86.26                1                    1 
        ATTACCGGTGT
+       chr20       4980820  4980822  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  
7.47           -90.63              -98.10                1                    2 
        GCCACCGCGCCCA
+       chr20       4980899  4980901  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  
3.17           -96.40              -99.57                1                    2 
        GTATACGCGTTCC
+       chr20       4980955  4980955  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  
0.33           -92.14              -92.47                1                    1 
        AGTCCCGATAT
+
+
+A positive value in the ``log_lik_ratio`` column indicates support for 
methylation. We have provided a helper script that can be used to calculate how 
often each reference position was methylated: ::
+
+       scripts/calculate_methylation_frequency.py -i methylation_calls.tsv > 
methylation_frequency.tsv
+
+The output is another tab-separated file, this time summarized by genomic 
position: ::
+
+       chromosome  start    end      num_cpgs_in_group  called_sites  
called_sites_methylated  methylated_frequency  group_sequence
+       chr20       5036763  5036763  1                  21            20       
                0.952                 split-group
+       chr20       5036770  5036770  1                  21            20       
                0.952                 split-group
+       chr20       5036780  5036780  1                  21            20       
                0.952                 split-group
+       chr20       5037173  5037173  1                  13            5        
                0.385                 AAGGACGTTAT
+
+In the example data set we have also included bisulfite data from ENCODE for 
the same region of chromosome 20. We can use the included 
``compare_methylation.py`` helper script to do a quick comparison between the 
nanopolish methylation output and bisulfite: ::
+
+    python compare_methylation.py bisulfite.ENCFF835NTC.example.tsv 
methylation_frequency.tsv > bisulfite_vs_nanopolish.tsv
+
+We can use R to visualize the results - we observe good correlation between 
the nanopolish methylation calls and bisulfite: ::
+
+    library(ggplot2)
+    library(RColorBrewer)
+    data <- read.table("bisulfite_vs_nanopolish.tsv", header=T)
+
+    # Set color palette for 2D heatmap
+    rf <- colorRampPalette(rev(brewer.pal(11,'Spectral')))
+    r <- rf(32)
+
+    c <- cor(data$frequency_1, data$frequency_2)
+    title <- sprintf("N = %d r = %.3f", nrow(data), c)
+    ggplot(data, aes(frequency_1, frequency_2)) +
+        geom_bin2d(bins=25) + scale_fill_gradientn(colors=r, trans="log10") +
+        xlab("Bisulfite Methylation Frequency") +
+        ylab("Nanopolish Methylation Frequency") +
+        theme_bw(base_size=20) +
+        ggtitle(title)
+
+Here's what the output should look like:
+
+.. figure:: _static/quickstart_methylation_results.png
+  :scale: 80%
+  :alt: quickstart_methylation_results
+
diff --git a/docs/source/quickstart_consensus.rst 
b/docs/source/quickstart_consensus.rst
new file mode 100644
index 0000000..59985f0
--- /dev/null
+++ b/docs/source/quickstart_consensus.rst
@@ -0,0 +1,115 @@
+.. _quickstart_consensus:
+
+Quickstart - how to polish the consensus assembly
+===================================================
+
+The original purpose for nanopolish was to improve the consensus assembly 
accuracy for Oxford Nanopore Technology sequencing reads. Here we provide a 
step-by-step tutorial to help you get started with our tool.
+
+**Requirements**:
+
+* `nanopolish v0.8.4 <installation.html>`_
+* `samtools v1.2 <http://samtools.sourceforge.net/>`_
+* `bwa v0.7.12 <https://github.com/lh3/bwa>`_
+* `MUMmer <https://github.com/mummer4/mummer>`_
+
+Download example dataset
+------------------------------------
+
+You can download the example dataset we will use here: ::
+
+    wget http://s3.climb.ac.uk/nanopolish_tutorial/ecoli_2kb_region.tar.gz
+    tar -xvf ecoli_2kb_region.tar.gz
+    cd ecoli_2kb_region
+
+**Details**:
+
+* Sample :     E. coli str. K-12 substr. MG1655
+* Instrument : MinION sequencing R9.4 chemistry
+* Basecaller : Albacore v2.0.1
+* Region: "tig00000001:200000-202000"
+* Note: Ligation-mediated PCR amplification performed
+
+This is a subset of reads that aligned to a 2kb region in the E. coli draft 
assembly. To see how we generated these files please refer to this section: 
+:ref:`creating_example_dataset`.
+
+You should find the following files:
+
+* ``reads.fasta`` : subset of basecalled reads
+* ``draft.fa`` : draft genome assembly
+* ``draft.fa.fai`` : draft genome assembly index
+* ``fast5_files/`` : a directory containing FAST5 files
+* ``ecoli_2kb_region.log`` : a log file for how the dataset was created with 
nanopolish helper script (``scripts/extract_reads_aligned_to_region.py``) 
+
+For the evaluation step you will need the reference genome: ::
+
+    wget -O ref.fa 
ftp://ftp.ncbi.nih.gov/genomes/archive/old_genbank/Bacteria/Escherichia_coli_K_12_substr__MG1655_uid225/U00096.ffn
+
+Analysis workflow
+-------------------------------
+
+The pipeline below describes the recommended analysis workflow for larger 
datasets. In this tutorial, we will run through the basic steps of the pipeline 
for this smaller (2kb) dataset.
+
+.. figure:: _static/nanopolish-workflow.png
+  :scale: 90%
+  :alt: nanopolish-tutorial-workflow
+
+Data preprocessing
+------------------------------------
+
+nanopolish needs access to the signal-level data measured by the nanopore 
sequencer. To begin, we need to create an index ``readdb`` file that links read 
ids with their signal-level data in the FAST5 files: ::
+
+    nanopolish index -d fast5_files/ reads.fasta
+
+We get the following files: ``reads.fasta.index``, ``reads.fasta.index.fai``, 
``reads.fasta.index.gzi``, and ``reads.fasta.index.readdb``.
+
+Compute the draft genome assembly using canu
+-----------------------------------------------
+
+As computing the draft genome assembly takes a few hours we have included the 
pre-assembled data for you (``draft.fa``).
+We used the following parameters with `canu <https://canu.readthedocs.io>`_: ::
+
+    canu \
+        -p ecoli -d outdir genomeSize=4.6m \
+        -nanopore-raw albacore-2.0.1-merged.fastq
+
+Compute a new consensus sequence for a draft assembly
+------------------------------------------------------------------------
+
+Now that we have ``reads.fasta`` indexed with ``nanopolish index``, and have a 
draft genome assembly ``draft.fa``, we can begin to improve the assembly with 
nanopolish. Let us get started! 
+
+First step, is to index the draft genome assembly. We can do that with the 
following command: ::
+
+    bwa index draft.fa
+
+Next, we align the original reads (``reads.fasta``) to the draft assembly 
(``draft.fa``) and sort alignments: ::
+
+    bwa mem -x ont2d -t 8 draft.fa reads.fasta | samtools sort -o 
reads.sorted.bam -T reads.tmp
+    samtools index reads.sorted.bam
+
+**Checkpoint**: we can do a quick check to see if this step worked. The bam 
file should not be empty. ::
+
+    samtools view reads.sorted.bam | head
+
+Then we run the consensus algorithm. For larger datasets we use 
``nanopolish_makerange.py`` to split the draft genome assembly into 50kb 
segments, so that we can run the consensus algorithm on each segment in 
parallel. The output would be the polished segments in ``fasta`` format. 
+Since our dataset is only covering a 2kb region, we skip this step and use the 
following command: ::
+
+    nanopolish variants --consensus polished.fa \
+        -w "tig00000001:200000-202000" \
+        -r reads.fasta \
+        -b reads.sorted.bam \
+        -g draft.fa
+
+We are left with our desired output: ``polished.fa``.
+
+Evaluate the assembly
+---------------------------------
+
+To analyze how nanopolish performed improving the accuracy we use `MUMmer 
<https://github.com/mummer4/mummer>`_. MUMmer contains "dnadiff", a program 
that enables us to see a report on alignment statistics. With dnadiff we can 
compare the two different assemblies. ::
+
+    mkdir analysis
+    MUMmer3.23/dnadiff --prefix analysis/draft.dnadiff ref.fa draft.fa
+    MUMmer3.23/dnadiff --prefix analysis/polished.dnadiff ref.fa polished.fa
+
+This generates ``draft.dnadiff.report`` and ``polished.dnadiff.report`` along 
with other files. The metric we are interested in is ``AvgIdentity`` under ``[ 
Alignments ] 1-to-1``, which is a measurement of how similar the genome 
assemblies are to the reference genome. We expect to see a higher value for the 
polished assembly than the draft ( ``99.90`` vs ``99.53`` ), concluding that 
the nanopolish consensus algorithm worked successfully.
+
+.. note:: The example dataset was PCR amplified causing a loss of methylation 
information. We recommend using the ``-q dam,dcm`` option with ``nanopolish 
variants --consensus`` if you have data with methylation information to account 
for known bacterial methyltransferases.
diff --git a/docs/source/quickstart_eventalign.rst 
b/docs/source/quickstart_eventalign.rst
new file mode 100644
index 0000000..fe4dc02
--- /dev/null
+++ b/docs/source/quickstart_eventalign.rst
@@ -0,0 +1,103 @@
+.. _quickstart_eventalign:
+
+Quickstart - how to align events to a reference genome
+========================================================
+
+The eventalign module in nanopolish is used to align events or "squiggles" to 
a reference genome. We (the developers of nanopolish) use this feature 
extensively when we want to see what the low-level signal information looks 
like. It helps us model the signal and discover differences in current that 
might hint at base modifications. Here we provide a step-by-step tutorial to 
help you get started with the nanopolish eventalign module.
+
+**For more information about eventalign**:
+
+* `Blog post: "Aligning Nanopore Events to a Reference" 
<http://simpsonlab.github.io/2015/04/08/eventalign/>`_
+* `Paper: "A complete bacterial genome assembled de novo using only nanopore 
sequencing data" <https://www.nature.com/articles/nmeth.3444>`_
+
+**Requirements**:
+
+* `nanopolish v0.8.4 <installation.html>`_
+* `samtools v1.2 <http://samtools.sourceforge.net/>`_
+* `bwa v0.7.12 <https://github.com/lh3/bwa>`_
+
+Download example dataset
+------------------------------------
+
+You can download the example dataset we will use here: ::
+
+    wget http://s3.climb.ac.uk/nanopolish_tutorial/ecoli_2kb_region.tar.gz
+    tar -xvf ecoli_2kb_region.tar.gz
+    cd ecoli_2kb_region
+
+**Details**:
+
+* Sample :    E. coli str. K-12 substr. MG1655
+* Instrument : MinION sequencing R9.4 chemistry
+* Basecaller : Albacore v2.0.1
+* Region: "tig00000001:200000-202000"
+* Note: Ligation-mediated PCR amplification performed
+
+This is a subset of reads that aligned to a 2kb region in the E. coli draft 
assembly. To see how we generated these files please refer to this section: 
:ref:`creating_example_dataset`. 
+
+You should find the following files:
+
+* ``reads.fasta`` : subset of basecalled reads
+* ``fast5_files/`` : a directory containing FAST5 files
+
+You will need the E. coli reference genome: ::
+
+    wget -O ref.fa 
ftp://ftp.ncbi.nih.gov/genomes/archive/old_genbank/Bacteria/Escherichia_coli_K_12_substr__MG1655_uid225/U00096.ffn
+
+Align the reads with bwa
+--------------------------------
+
+In order to run bwa we first need to index the reference genome: ::
+
+    bwa index ref.fa
+
+**Output files**: ``ref.fa.sa``, ``ref.fa.amb``, ``ref.fa.ann``, 
``ref.fa.pac``, and ``ref.fa.bwt``.
+
+We will need to index the reads as well: ::
+
+    nanopolish index -d fast5_files/ reads.fasta
+
+**Output files**: ``reads.fasta.index``, ``reads.fasta.index.fai``, 
``reads.fasta.index.gzi``, and ``reads.fasta.index.readdb``.   
+
+Then we can align the reads to the reference: ::
+
+    bwa mem -x ont2d ref.fa reads.fasta | samtools sort -o 
reads-ref.sorted.bam -T reads.tmp
+    samtools index reads-ref.sorted.bam
+
+**Output files**: ``reads-ref.sorted.bam`` and ``reads-ref.sorted.bam.bai``.
+
+**Checkpoint**: Let's see if the bam file is not truncated. This will check 
that the beginning of the file contains a valid header, and checks if the EOF 
is present. This will exit with a non-zero exit code if the conditions were not 
met: ::
+
+    samtools quickcheck reads-ref.sorted.bam
+ 
+Align the nanopore events to a reference
+-----------------------------------------------
+
+Now we are ready to run nanopolish to align the events to the reference 
genome: ::
+
+    nanopolish eventalign \
+        --reads reads.fasta \
+        --bam reads-ref.sorted.bam \
+        --genome ref.fa \
+        --scale-events > reads-ref.eventalign.txt
+
+Assess the eventalign output
+-----------------------------------------------
+
+If we take a peek at the first few lines of ``reads-ref.eventalign.txt`` this 
is what we get: ::
+
+       contig    position  reference_kmer  read_index  strand  event_index  
event_level_mean  event_stdv  event_length  model_kmer  model_mean  model_stdv  
standardized_level
+       gi|545778205|gb|U00096.3|:c514859-514401  3         ATGGAG          0   
        t       16538        89.82             3.746       0.00100       CTCCAT 
     92.53       2.49        -0.88
+       gi|545778205|gb|U00096.3|:c514859-514401  3         ATGGAG          0   
        t       16537        88.89             2.185       0.00100       CTCCAT 
     92.53       2.49        -1.18
+       gi|545778205|gb|U00096.3|:c514859-514401  3         ATGGAG          0   
        t       16536        94.96             2.441       0.00125       CTCCAT 
     92.53       2.49        0.79
+       gi|545778205|gb|U00096.3|:c514859-514401  3         ATGGAG          0   
        t       16535        81.63             2.760       0.00150       NNNNNN 
     0.00        0.00        inf
+       gi|545778205|gb|U00096.3|:c514859-514401  7         AGTTAA          0   
        t       16534        78.96             2.278       0.00075       TTAACT 
     75.55       3.52        0.79
+       gi|545778205|gb|U00096.3|:c514859-514401  8         GTTAAT          0   
        t       16533        98.81             4.001       0.00100       ATTAAC 
     95.87       3.30        0.72
+       gi|545778205|gb|U00096.3|:c514859-514401  9         TTAATG          0   
        t       16532        96.92             1.506       0.00150       CATTAA 
     95.43       3.32        0.36
+       gi|545778205|gb|U00096.3|:c514859-514401  10        TAATGG          0   
        t       16531        70.86             0.402       0.00100       CCATTA 
     68.99       3.70        0.41
+       gi|545778205|gb|U00096.3|:c514859-514401  11        AATGGT          0   
        t       16530        91.24             4.256       0.00175       ACCATT 
     85.84       2.74        1.60
+
+Example plots
+-------------
+
+In `Figure 1 of our methylation detection paper 
<https://www.nature.com/articles/nmeth.4184>`_ we show a histogram of 
``event_level_mean`` for a selection of k-mers to demonstrate how methylation 
changes the observed current. The data for these figures was generated by 
eventalign, which we subsequently plotted in R using ggplot2.
diff --git a/scripts/calculate_methylation_frequency.py 
b/scripts/calculate_methylation_frequency.py
old mode 100644
new mode 100755
index 0be0870..b6b6371
--- a/scripts/calculate_methylation_frequency.py
+++ b/scripts/calculate_methylation_frequency.py
@@ -9,6 +9,10 @@ from collections import namedtuple
 def make_key(c, s, e):
     return c + ":" + str(s) + ":" + str(e)
 
+def split_key(k):
+    f = k.split(":")
+    return (f[0], int(f[1]), int(f[2]))
+
 class SiteStats:
     def __init__(self, g_size, g_seq):
         self.num_reads = 0
@@ -75,9 +79,10 @@ for record in csv_reader:
 # header
 print("\t".join(["chromosome", "start", "end", "num_cpgs_in_group", 
"called_sites", "called_sites_methylated", "methylated_frequency", 
"group_sequence"]))
 
-for key in sites:
+sorted_keys = sorted(sites.keys(), key = lambda x: split_key(x))
+
+for key in sorted_keys:
     if sites[key].called_sites > 0:
         (c, s, e) = key.split(":")
         f = float(sites[key].called_sites_methylated) / sites[key].called_sites
-        print("\t".join([str(x) for x in [c, s, e, sites[key].group_size, 
sites[key].called_sites, sites[key].called_sites_methylated, f, 
sites[key].sequence]]))
-
+        print("%s\t%s\t%s\t%d\t%d\t%d\t%.3f\t%s" % (c, s, e, 
sites[key].group_size, sites[key].called_sites, 
sites[key].called_sites_methylated, f, sites[key].sequence))
diff --git a/scripts/extract_reads_aligned_to_region.py 
b/scripts/extract_reads_aligned_to_region.py
new file mode 100755
index 0000000..9a66e56
--- /dev/null
+++ b/scripts/extract_reads_aligned_to_region.py
@@ -0,0 +1,309 @@
+#!/usr/bin/env python
+'''
+========================================================
+Extract info on reads that align to a given region 
+in draft genome assembly.
+========================================================
+'''
+try:
+       from Bio import SeqIO
+       import pysam
+       import argparse
+       import subprocess
+       import tarfile
+       import gzip
+       import sys,os
+except ImportError:
+       print('Missing package(s)')
+       quit()
+
+verbose = False
+log = list()
+
+def main():
+       # --------------------------------------------------------
+       # PART 0:       Parse input
+       # --------------------------------------------------------
+       parser = argparse.ArgumentParser(description='Extract and package reads 
within region')
+       parser.add_argument('-v', '--verbose', action="store_true", 
default=False, required=False, dest="verbose", help="Use for verbose output 
with info on progress.")
+       parser.add_argument('-b', '--bam', action="store", required=True, 
dest="bam", help="Sorted bam file created by aligning reads to the draft genome 
(refer to reads.sorted.bam in Nanopolish README).")
+       parser.add_argument('-r', '--reads', action="store", 
dest="fa_filename", help="Fasta, fastq, fasta.gz, or fastq.gz file (refer to 
reads.fa in Nanopolish README)")
+       parser.add_argument('-g', '--genome',  action="store", required=True, 
dest="draft_ga", help="Draft genome assembly (refer to draft.fa in Nanopolish 
README).")
+       parser.add_argument('-w', '--window', action="store", required=True, 
dest="draft_ga_coords", help="Draft genome assembly coordinates wrapped in 
quotes ex. \"tig000001:10000-20000\".")
+       parser.add_argument('-o', '--output_prefix', action="store", 
required=False, default="reads_subset", dest="output_prefix", help="Output 
prefix for tar.gz file and log file.")
+       args = parser.parse_args()
+       
+       # Check to see if user used verbose option
+       global verbose
+       if args.verbose:
+               verbose = True
+
+       # Infer readdb file from fasta/q file
+       readdb = args.fa_filename + ".index.readdb"
+
+       custom_print( "===================================================" )
+       custom_print( "Extract reads that align to given region" )
+       custom_print( "Package all necessary files to reproduce error" )
+       custom_print( "===================================================" )
+
+       # --------------------------------------------------------
+       # PART 1:       Validate input
+       # --------------------------------------------------------
+       custom_print( "[ Input ]" )
+       custom_print( "[+] Extracting from draft genome assembly coords: " + 
args.draft_ga_coords )
+       custom_print( "[+] BAM file (reads.fa aligned to draft.fa): " + 
args.bam )
+       custom_print( "[+] Readdb file: " + readdb )
+       custom_print( "[+] Draft genome assembly (draft.fa): " + args.draft_ga )
+       custom_print( "[+] FASTA/Q file (reads.fa): " + args.fa_filename )
+       custom_print( "[+] Output prefix: " + args.output_prefix ) 
+
+       custom_print( "[ Input check ]" )
+       files = list()
+       files.append(args.bam)
+       files.append(readdb)
+       files.append(args.fa_filename)
+       files.append(args.draft_ga)
+       draft_ga_fai = args.draft_ga + ".fai"
+       files.append(draft_ga_fai)
+
+       for i in files:
+               if not os.path.exists(i) or not os.path.getsize(i) > 0 or not 
os.access(i, os.R_OK):
+                       print( "Expecting " + i + ". But does not exist, is 
empty or is not readable." )
+                       sys.exit(1)
+
+       custom_print( "[ Validated input ] All input files exist, are 
not-empty, and are readable." )
+
+       # --------------------------------------------------------
+       # PART 2:       Reassign input argument values  
+       # --------------------------------------------------------
+       # o = old/original, ga = genome assembly, fa = fasta/q file
+       # coords = coordinates, op = output
+       o_bam = args.bam
+       o_readdb = readdb
+       o_fa = args.fa_filename
+       op = args.output_prefix
+       draft_ga_coords = args.draft_ga_coords
+
+       # --------------------------------------------------------
+       # PART 3:       With user input ref coords, extract all 
+       #               aligned reads within these coordinates, 
+       #               store read_ids, and fast5 files.
+       # --------------------------------------------------------
+       custom_print( "[ Extracting info on reads aligned to region ] \t" + 
draft_ga_coords )
+       samfile = pysam.AlignmentFile(o_bam, "rb")
+       region_read_ids = list()
+       region_num_reads = 0
+
+       # get all read ids of reads that are aligned to region in draft assembly
+       for read in samfile.fetch(region=draft_ga_coords):
+               id = read.query_name
+               # add to list if not already in list
+               if not id in region_read_ids:
+                       # store read id in list
+                       region_read_ids.append(id)
+                       # count number of reads that were aligned to the given 
region
+                       region_num_reads+=1
+
+       # --------------------------------------------------------
+       # PART 4:   Parse readdb file and find path to fast5 files
+       #               associated with each read that aligned to region
+       # --------------------------------------------------------
+       # readdb file has 2 columns: one indicating read_id and another 
indicating the fast5 file the read came from
+       # each row represents a read
+       custom_print( "[ Reading readdb file ]" )
+       region_fast5_files = dict()
+       with open (o_readdb, "r") as file:
+               for line in file:
+                       l = line.split("\t")
+                       read_id = l.pop(0)
+                       if read_id in region_read_ids:
+                               fast5_file = l.pop(0)
+                               region_fast5_files[str(read_id)] = 
fast5_file.rstrip()
+
+       # --------------------------------------------------------
+       # PART 5:   Make a region BAM and BAI file
+       # --------------------------------------------------------
+       new_bam = "reads.bam"
+       custom_print( "[ Writing to a new BAM file ] \t" + new_bam )
+       region_reads = pysam.view("-b", o_bam, draft_ga_coords, "-o", new_bam, 
catch_stdout=False)
+       
+       new_bam_index = new_bam + ".bai"
+       custom_print( "[ Writing to a new BAI file ] \t" + new_bam_index )
+       pysam.index(new_bam, new_bam_index)
+
+       # --------------------------------------------------------
+       # PART 6:       With user input ref coords, extract all 
+       #               aligned reads within these coordinates 
+       #               and make new FASTA file
+       # --------------------------------------------------------
+       # detect type of sequences file then handle accordingly
+       file_type = detect_fa_filetype(o_fa)
+       new_fa = "reads.fasta"
+       custom_print( "[ Writing to a new fasta file ]\t" +  new_fa )
+       with open (new_fa, "w") as fout:
+               if ".gz" in file_type:
+                       with gzip.open(o_fa, "rt") as fin:
+                               if "fasta.gz" in file_type:
+                                       for record in SeqIO.parse(fin, "fasta"):
+                                               if record.id in region_read_ids:
+                                                       fout.write(">" + 
record.id + "\n")
+                                                       
fout.write(str(record.seq) + "\n")
+                               elif "fastq.gz" in file_type:
+                                       for record in SeqIO.parse(fin, "fastq"):
+                                               if record.id in region_read_ids:
+                                                       fout.write(">" + 
record.id + "\n")
+                                                       
fout.write(str(record.seq) + "\n")
+               else:
+                       with open(o_fa, "rt") as fin:
+                               if "fasta" in file_type:
+                                       for record in SeqIO.parse(fin, "fasta"):
+                                               if record.id in region_read_ids:
+                                                       fout.write(">" + 
record.id + "\n")
+                                                       
fout.write(str(record.seq) + "\n")
+                               elif "fastq" in file_type:
+                                       for record in SeqIO.parse(fin, "fastq"):
+                                               if record.id in region_read_ids:
+                                                       fout.write(">" + 
record.id + "\n")
+                                                       
fout.write(str(record.seq) + "\n")
+
+       # --------------------------------------------------------
+       # PART 7:       Let's get to tarring
+       # --------------------------------------------------------
+       # While tarring, we need to fix the directory structure
+       # such that the original path to files are not saved.
+       # For each fast5 file we need to extract the basename,
+       # and save it in tar such that we save only the basename,
+       # and not the whole path from the original source.
+       tar_filename = op + ".tar.gz"
+       archive = tarfile.open(tar_filename, "w:gz")
+       custom_print( "[ Creating a tar.gz file ] \t" + tar_filename )
+       custom_print( "[+] FAST5 files: " + op + "/fast5_files/<FAST5 file(s)>" 
)
+       for r in region_fast5_files.keys():
+               read_id = r
+               f5 = region_fast5_files[r]
+
+               # get basename of fast5 file
+               f5_basename = extract_basename(f5)
+               an = op + "/fast5_files/" + f5_basename
+               archive.add(f5, arcname=an)
+
+       # --------------------------------------------------------
+       # PART 8:       Add new files to tar
+       #                       new fasta, new bam, and new bai with reads 
+       #                       in the region given only
+       # --------------------------------------------------------
+       an = op + "/" + new_fa
+       archive.add(new_fa, arcname=an)
+       custom_print( "[+] New FASTA: " + an )
+       
+       an_new_bam = op + "/" + new_bam
+       archive.add(new_bam, arcname=an_new_bam)
+       custom_print( "[+] New BAM: " + an_new_bam )
+
+       an_new_bam_index = op + "/" + new_bam_index
+       archive.add(new_bam_index, arcname=an_new_bam_index)
+       custom_print( "[+] New BAI: " + an_new_bam_index )
+
+       # --------------------------------------------------------
+       # PART 9:       Add original draft genome assembly file
+       #                       and the index file
+       # --------------------------------------------------------
+       an_draft_ga = op + "/draft.fa"
+       archive.add(args.draft_ga, arcname=an_draft_ga)
+       custom_print( "[+] Original draft ga: " + an_draft_ga )
+
+       an_draft_ga_fai = op + "/draft.fa.fai"
+       archive.add(i, arcname=an_draft_ga_fai)
+       custom_print( "[+] Original draft ga index: " + an_draft_ga_fai )
+
+       # --------------------------------------------------------
+       # PART 10:      Check the number of reads in all new files
+       # --------------------------------------------------------
+       custom_print( "[ Output check ] " )
+       # check the length of bam file
+       num_reads_bam = region_num_reads
+       num_reads_fasta = int(float(file_length(new_fa))/2.0)
+       num_fast5_files = len(region_fast5_files)
+       values = list()
+       values.append(num_reads_bam)
+       values.append(num_reads_fasta)
+       custom_print( "[+] Num reads in new BAM: \t" + str(num_reads_bam) )
+       custom_print( "[+] Num reads in new FASTA: \t" + str(num_reads_fasta) )
+       custom_print( "[+] Num files in fast5_files/: \t" + 
str(num_fast5_files))
+       if not all( v == num_fast5_files for v in values ):
+               print( "[!] WARNING: The number of reads in the new bam, new 
fasta, and num of fast5 files tarred are not equal..." )
+       else:
+               custom_print( "[ Validated output ] Number of reads in the new 
bam, new fasta, and num of fast5 files tarred are equal!" )
+
+       # --------------------------------------------------------
+       # FINAL:        Output log if verbose flag not used
+       # --------------------------------------------------------
+       global log
+       logfile = op + ".log"
+       with open (logfile, "w") as lfile:
+               for s in log:
+                       lfile.write(s + "\n")
+       an_logfile = op + "/" + logfile
+       custom_print( "[ Log file ] " +  an_logfile )
+       custom_print( "[ Tar file ] " + str(tar_filename) )
+       custom_print( "[ Finished ] " )
+       archive.add(logfile, arcname=an_logfile)
+       archive.close()
+
def file_length(filename):
	# ========================================================
	# Returns number of lines in a file
	# --------------------------------------------------------
	# Input:	Filename
	# Output:	Number of lines in the file (0 for an empty file)
	# ========================================================
	# NOTE: the previous version read the loop variable after the
	# loop, which raised UnboundLocalError when the file was empty.
	count = 0
	with open(filename) as f:
		# enumerate(..., 1) leaves count == number of lines read
		for count, _line in enumerate(f, 1):
			pass
	return count
+
def extract_basename(filename):
	# ========================================================
	# Returns base filename
	# --------------------------------------------------------
	# Input:	Filename with path
	# Output:	Base filename
	# ========================================================
	# Strip trailing path separators ("/" and "\") in a single pass;
	# otherwise os.path.basename would return an empty string.
	# Stripping each separator in a separate rstrip call (as before)
	# missed names ending in a mix of both separators.
	trimmed = filename.rstrip("\\/")
	return str(os.path.basename(trimmed))
+
def detect_fa_filetype(fa_filename):
	# ========================================================
	# Detects filetype of sequences input
	# --------------------------------------------------------
	# Input:	FASTA/Q filename
	# Output:	One of 'fastq.gz', 'fasta.gz', 'fastq', 'fasta';
	#			exits with status 1 on anything else
	# ========================================================
	if fa_filename.endswith('fa.gz'):
		print("Possibly using the reads file generated by nanopolish index? Use original reads file...")
	# check compressed extensions first so ".fastq.gz" is not
	# misreported once the shorter suffixes are tried
	recognized = ('fastq.gz', 'fasta.gz', 'fastq', 'fasta')
	hits = [ext for ext in recognized if fa_filename.endswith(ext)]
	if hits:
		return hits[0]
	print( "Must be either fasta, fastq, fasta.gz, fastq.gz" )
	sys.exit(1)
+
def custom_print(s):
	# ========================================================
	# Depending on verbose flag, will save all prints to
	# log list, or will print to stdout
	# --------------------------------------------------------
	# Input:	string to print
	# ========================================================
	global verbose
	global log
	if verbose:
		# use the print() function form: the bare "print s"
		# statement is Python-2-only and a SyntaxError under
		# Python 3, while print(s) behaves the same on both
		print(s)
	log.append(s)
+
# Script entry point: run the full extraction pipeline when executed
# directly (no-op when imported as a module).
if __name__ == "__main__":
	main()
diff --git a/scripts/nanopolish_merge.py b/scripts/nanopolish_merge.py
index e08bb79..ea059a3 100644
--- a/scripts/nanopolish_merge.py
+++ b/scripts/nanopolish_merge.py
@@ -25,7 +25,7 @@ def merge_into_consensus(consensus, incoming, overlap_length):
 
     assert(len(aln_con) == len(aln_inc))
 
-    for i in xrange(0, len(aln_con) / 2):
+    for i in range(0, len(aln_con) // 2):
         a = aln_con[i]
         b = aln_inc[i]
 
@@ -74,24 +74,29 @@ for fn in sys.argv[1:]:
             segments_by_name[contig] = dict()
 
         segment_start, segment_end = segment_range.split("-")
-
-        sys.stderr.write('Insert %s %s\n' % (contig, segment_start))
         segments_by_name[contig][int(segment_start)] = str(rec.seq)
 
 # Assemble while making sure every segment is present
 for contig_name in sorted(segments_by_name.keys()):
     assembly = ""
     prev_segment = None
+    all_segments_found = True
     for segment_start in sorted(segments_by_name[contig_name]):
 
-        sys.stderr.write('Merging %s %d\n' % (contig_name, segment_start))
         # Ensure the segments overlap
-        assert(prev_segment is None or prev_segment + SEGMENT_LENGTH + 
OVERLAP_LENGTH > segment_start)
+        if not( prev_segment is None or prev_segment + SEGMENT_LENGTH + 
OVERLAP_LENGTH > segment_start ):
+            sys.stderr.write("error: segment starting at %s:%d is missing\n" % 
(contig, prev_segment + SEGMENT_LENGTH + 40))
+            all_segments_found = False
 
         sequence = segments_by_name[contig_name][segment_start]
 
         assembly = merge_into_consensus(assembly, sequence, OVERLAP_LENGTH)
         prev_segment = segment_start
 
+
     # Write final assembly
-    print(">%s\n%s" % (contig_name, assembly))
+    if all_segments_found:
+        print(">%s\n%s" % (contig_name, assembly))
+    else:
+        sys.stderr.write("error: some segments are missing, could not merge 
contig %s\n" % (contig))
+        sys.exit(1)
diff --git a/src/alignment/nanopolish_alignment_db.cpp 
b/src/alignment/nanopolish_alignment_db.cpp
index ec8e6ce..357aed1 100644
--- a/src/alignment/nanopolish_alignment_db.cpp
+++ b/src/alignment/nanopolish_alignment_db.cpp
@@ -73,6 +73,7 @@ EventAlignmentRecord::EventAlignmentRecord(SquiggleRead* sr,
         size_t kmer_pos_ref_strand = seq_record.aligned_bases[i].read_pos;
         size_t kmer_pos_read_strand = seq_record.rc ? 
this->sr->flip_k_strand(kmer_pos_ref_strand, k) : kmer_pos_ref_strand;
         size_t event_idx = 
this->sr->get_closest_event_to(kmer_pos_read_strand, strand_idx);
+        assert(event_idx != -1);
         this->aligned_events.push_back( { seq_record.aligned_bases[i].ref_pos, 
(int)event_idx });
     }
     this->rc = strand_idx == 0 ? seq_record.rc : !seq_record.rc;
diff --git a/src/alignment/nanopolish_eventalign.cpp 
b/src/alignment/nanopolish_eventalign.cpp
index 1d7446d..b7eb14a 100644
--- a/src/alignment/nanopolish_eventalign.cpp
+++ b/src/alignment/nanopolish_eventalign.cpp
@@ -31,11 +31,15 @@
 #include "nanopolish_read_db.h"
 #include "nanopolish_hmm_input_sequence.h"
 #include "nanopolish_pore_model_set.h"
+#include "nanopolish_bam_processor.h"
 #include "H5pubconf.h"
 #include "profiler.h"
 #include "progress.h"
 
 //
+using namespace std::placeholders;
+
+//
 // Getopt
 //
 #define SUBPROGRAM "eventalign"
@@ -508,11 +512,11 @@ EventalignSummary summarize_alignment(const SquiggleRead& 
sr,
 }
 
 // Realign the read in event space
-void realign_read(EventalignWriter writer,
-                  const ReadDB& read_db, 
-                  const faidx_t* fai, 
-                  const bam_hdr_t* hdr, 
-                  const bam1_t* record, 
+void realign_read(const ReadDB& read_db,
+                  const faidx_t* fai,
+                  const EventalignWriter& writer,
+                  const bam_hdr_t* hdr,
+                  const bam1_t* record,
                   size_t read_idx,
                   int region_start,
                   int region_end)
@@ -524,12 +528,12 @@ void realign_read(EventalignWriter writer,
     SquiggleRead sr(read_name, read_db, opt::write_samples ? 
SRF_LOAD_RAW_SAMPLES : 0);
 
     if(opt::verbose > 1) {
-        fprintf(stderr, "Realigning %s [%zu %zu]\n", 
+        fprintf(stderr, "Realigning %s [%zu %zu]\n",
                 read_name.c_str(), sr.events[0].size(), sr.events[1].size());
     }
-    
+
     for(int strand_idx = 0; strand_idx < 2; ++strand_idx) {
-        
+
         // Do not align this strand if it was not sequenced
         if(!sr.has_events_for_strand(strand_idx)) {
             continue;
@@ -603,8 +607,10 @@ std::vector<EventAlignment> align_read_to_ref(const 
EventAlignmentParameters& pa
     ref_seq = pore_model->pmalphabet->disambiguate(ref_seq);
     std::string rc_ref_seq = 
pore_model->pmalphabet->reverse_complement(ref_seq);
 
-    if(ref_offset == 0)
+    // Skip unmapped
+    if((params.record->core.flag & BAM_FUNMAP) != 0) {
         return alignment_output;
+    }
 
     // Get the read-to-reference aligned segments
     std::vector<AlignedSegment> aligned_segments = 
get_aligned_segments(params.record);
@@ -620,8 +626,9 @@ std::vector<EventAlignment> align_read_to_ref(const 
EventAlignmentParameters& pa
         int max_kmer_idx = params.sr->read_sequence.size() - k;
         trim_aligned_pairs_to_kmer(aligned_pairs, max_kmer_idx);
 
-        if(aligned_pairs.empty())
+        if(aligned_pairs.empty()) {
             return alignment_output;
+        }
 
         bool do_base_rc = bam_is_rev(params.record);
         bool rc_flags[2] = { do_base_rc, !do_base_rc }; // indexed by strand
@@ -672,8 +679,9 @@ std::vector<EventAlignment> align_read_to_ref(const 
EventAlignmentParameters& pa
             HMMInputSequence hmm_sequence(fwd_subseq, rc_subseq, 
pore_model->pmalphabet);
             
             // Require a minimum amount of sequence to align to
-            if(hmm_sequence.length() < 2 * k)
+            if(hmm_sequence.length() < 2 * k) {
                 break;
+            }
 
             // Set up HMM input
             HMMInputData input;
@@ -862,42 +870,10 @@ int eventalign_main(int argc, char** argv)
 
     ReadDB read_db;
     read_db.load(opt::reads_file);
-    
-    // Open the BAM and iterate over reads
-
-    // load bam file
-    htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r");
-    assert(bam_fh != NULL);
 
-    // load bam index file
-    std::string index_filename = opt::bam_file + ".bai";
-    hts_idx_t* bam_idx = bam_index_load(index_filename.c_str());
-    if(bam_idx == NULL) {
-        bam_index_error_exit(opt::bam_file);
-    }
-
-    // read the bam header
-    bam_hdr_t* hdr = sam_hdr_read(bam_fh);
-    
     // load reference fai file
     faidx_t *fai = fai_load(opt::genome_file.c_str());
 
-    hts_itr_t* itr;
-
-    // If processing a region of the genome, only emit events aligned to this 
window
-    int clip_start = -1;
-    int clip_end = -1;
-
-    if(opt::region.empty()) {
-        // TODO: is this valid?
-        itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0);
-    } else {
-
-        fprintf(stderr, "Region: %s\n", opt::region.c_str());
-        itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str());
-        hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end);
-    }
-
 #ifndef H5_HAVE_THREADSAFE
     if(opt::num_threads > 1) {
         fprintf(stderr, "You enabled multi-threading but you do not have a 
threadsafe HDF5\n");
@@ -909,72 +885,30 @@ int eventalign_main(int argc, char** argv)
     // Initialize output
     EventalignWriter writer = { NULL, NULL, NULL };
 
-    if(opt::output_sam) {
-        writer.sam_fp = hts_open("-", "w");
-        emit_sam_header(writer.sam_fp, hdr);
-    } else {
-        writer.tsv_fp = stdout;
-        emit_tsv_header(writer.tsv_fp);
-    }
-
     if(!opt::summary_file.empty()) {
         writer.summary_fp = fopen(opt::summary_file.c_str(), "w");
         // header
         fprintf(writer.summary_fp, 
"read_index\tread_name\tfast5_path\tmodel_name\tstrand\tnum_events\t");
         fprintf(writer.summary_fp, 
"num_steps\tnum_skips\tnum_stays\ttotal_duration\tshift\tscale\tdrift\tvar\n");
     }
-    
-    // Initialize iteration
-    std::vector<bam1_t*> records(opt::batch_size, NULL);
-    for(size_t i = 0; i < records.size(); ++i) {
-        records[i] = bam_init1();
-    }
 
-    int result;
-    size_t num_reads_realigned = 0;
-    size_t num_records_buffered = 0;
-    Progress progress("[eventalign]");
+    // the BamProcessor framework calls the input function with the
+    // bam record, read index, etc passed as parameters
+    // bind the other parameters the worker function needs here
+    auto f = std::bind(realign_read, std::ref(read_db), std::ref(fai), 
std::ref(writer), _1, _2, _3, _4, _5);
+    BamProcessor processor(opt::bam_file, opt::region, opt::num_threads);
 
-    do {
-        assert(num_records_buffered < records.size());
-        
-        // read a record into the next slot in the buffer
-        result = sam_itr_next(bam_fh, itr, records[num_records_buffered]);
-        
-        num_records_buffered += result >= 0;
-        // realign if we've hit the max buffer size or reached the end of file
-        if(num_records_buffered == records.size() || result < 0) {
-            #pragma omp parallel for            
-            for(size_t i = 0; i < num_records_buffered; ++i) {
-                bam1_t* record = records[i];
-                size_t read_idx = num_reads_realigned + i;
-                if( (record->core.flag & BAM_FUNMAP) == 0) {
-                    realign_read(writer, read_db, fai, hdr, record, read_idx, 
clip_start, clip_end);
-                }
-            }
-
-            num_reads_realigned += num_records_buffered;
-            num_records_buffered = 0;
-        }
-
-        if(opt::progress) {
-            fprintf(stderr, "Realigned %zu reads in %.1lfs\r", 
num_reads_realigned, progress.get_elapsed_seconds());
-        }
-    } while(result >= 0);
- 
-    assert(num_records_buffered == 0);
-
-    // cleanup records
-    for(size_t i = 0; i < records.size(); ++i) {
-        bam_destroy1(records[i]);
+    // Copy the bam header to std
+    if(opt::output_sam) {
+        writer.sam_fp = hts_open("-", "w");
+        emit_sam_header(writer.sam_fp, processor.get_bam_header());
+    } else {
+        writer.tsv_fp = stdout;
+        emit_tsv_header(writer.tsv_fp);
     }
 
-    // cleanup
-    sam_itr_destroy(itr);
-    bam_hdr_destroy(hdr);
-    fai_destroy(fai);
-    sam_close(bam_fh);
-    hts_idx_destroy(bam_idx);
+    // run
+    processor.parallel_run(f);
 
     if(writer.sam_fp != NULL) {
         hts_close(writer.sam_fp);
@@ -983,5 +917,7 @@ int eventalign_main(int argc, char** argv)
     if(writer.summary_fp != NULL) {
         fclose(writer.summary_fp);
     }
+
+    fai_destroy(fai);
     return EXIT_SUCCESS;
 }
diff --git a/src/common/nanopolish_bam_processor.cpp 
b/src/common/nanopolish_bam_processor.cpp
index cbec00e..d851c8f 100644
--- a/src/common/nanopolish_bam_processor.cpp
+++ b/src/common/nanopolish_bam_processor.cpp
@@ -27,8 +27,7 @@ BamProcessor::BamProcessor(const std::string& bam_file,
     assert(m_bam_fh != NULL);
 
     // load bam index file
-    std::string index_filename = m_bam_file + ".bai";
-    m_bam_idx = bam_index_load(index_filename.c_str());
+    m_bam_idx = sam_index_load(m_bam_fh, bam_file.c_str());
     if(m_bam_idx == NULL) {
         bam_index_error_exit(m_bam_file);
     }
diff --git a/src/common/nanopolish_common.h b/src/common/nanopolish_common.h
index d782230..feb20ed 100644
--- a/src/common/nanopolish_common.h
+++ b/src/common/nanopolish_common.h
@@ -18,7 +18,7 @@
 #include "logsum.h"
 
 #define PACKAGE_NAME "nanopolish"
-#define PACKAGE_VERSION "0.8.4"
+#define PACKAGE_VERSION "0.8.5"
 #define PACKAGE_BUGREPORT "https://github.com/jts/nanopolish/issues";
 
 //
diff --git a/src/main/nanopolish.cpp b/src/main/nanopolish.cpp
index f164d73..5dc9fb9 100644
--- a/src/main/nanopolish.cpp
+++ b/src/main/nanopolish.cpp
@@ -62,6 +62,9 @@ int print_version(int, char **)
 
 int main(int argc, char** argv)
 {
+    // Turn off HDF's exception printing, which is generally unhelpful for 
users
+    hdf5::H5Eset_auto(0, NULL, NULL);
+
     int ret = 0;
     if(argc <= 1) {
         printf("error: no command provided\n");
@@ -76,15 +79,17 @@ int main(int argc, char** argv)
             ret = print_usage( argc - 1, argv + 1);
     }
 
+
     // Emit a warning when some reads had to be skipped
     extern int g_total_reads;
     extern int g_unparseable_reads;
     extern int g_qc_fail_reads;
     extern int g_failed_calibration_reads;
     extern int g_failed_alignment_reads;
+    extern int g_bad_fast5_file;
     if(g_total_reads > 0) {
-        fprintf(stderr, "[post-run summary] total reads: %d unparseable: %d qc 
fail: %d could not calibrate: %d no alignment: %d\n", 
-            g_total_reads, g_unparseable_reads, g_qc_fail_reads, 
g_failed_calibration_reads, g_failed_alignment_reads);
+        fprintf(stderr, "[post-run summary] total reads: %d, unparseable: %d, 
qc fail: %d, could not calibrate: %d, no alignment: %d, bad fast5: %d\n", 
+            g_total_reads, g_unparseable_reads, g_qc_fail_reads, 
g_failed_calibration_reads, g_failed_alignment_reads, g_bad_fast5_file);
     }
     return ret;
 }
diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp
index 763265c..36214f8 100644
--- a/src/nanopolish_call_variants.cpp
+++ b/src/nanopolish_call_variants.cpp
@@ -84,7 +84,7 @@ static const char *CONSENSUS_USAGE_MESSAGE =
 "  -e, --event-bam=FILE                 the events aligned to the reference 
genome are in bam FILE\n"
 "  -g, --genome=FILE                    the reference genome is in FILE\n"
 "  -p, --ploidy=NUM                     the ploidy level of the sequenced 
genome\n"
-"  -q  --methylation-aware=STR          turn on methylation aware polishing 
and test motifs given in STR (example: -m dcm,dam)\n"
+"  -q  --methylation-aware=STR          turn on methylation aware polishing 
and test motifs given in STR (example: -q dcm,dam)\n"
 "      --genotype=FILE                  call genotypes for the variants in the 
vcf FILE\n"
 "  -o, --outfile=FILE                   write result to FILE [default: 
stdout]\n"
 "  -t, --threads=NUM                    use NUM threads (default: 1)\n"
@@ -283,14 +283,14 @@ std::vector<Variant> 
generate_candidate_single_base_edits(const AlignmentDB& ali
 
     // Add all positively-scoring single-base changes into the candidate set
     for(size_t i = region_start; i < region_end; ++i) {
-        
+
         int calling_start = i - opt::screen_flanking_sequence;
         int calling_end = i + 1 + opt::screen_flanking_sequence;
 
         if(!alignments.are_coordinates_valid(contig, calling_start, 
calling_end)) {
             continue;
         }
-        
+
         std::vector<Variant> tmp_variants;
         for(size_t j = 0; j < 4; ++j) {
             // Substitutions
diff --git a/src/nanopolish_extract.cpp b/src/nanopolish_extract.cpp
index e565865..2d4b1be 100644
--- a/src/nanopolish_extract.cpp
+++ b/src/nanopolish_extract.cpp
@@ -58,11 +58,18 @@ namespace opt
 }
 static std::ostream* os_p;
 
-std::vector< std::pair< unsigned, std::string > >
+struct ExtractionGroup
+{
+    fast5::Basecall_Group_Description bcd;
+    unsigned int strand_idx;
+    std::string basecall_group;
+};
+
+std::vector<ExtractionGroup>
 get_preferred_basecall_groups(const fast5::File& f)
 {
     bool have_2d = false;
-    std::vector< std::pair< unsigned, std::string > > res;
+    std::vector<ExtractionGroup> res;
     // check 2d
     if (opt::read_type == "any"
         or opt::read_type == "2d"
@@ -92,7 +99,7 @@ get_preferred_basecall_groups(const fast5::File& f)
                 and f.have_basecall_events(1, gr))
             {
                 have_2d = true;
-                res.push_back(std::make_pair(2, gr));
+                res.push_back( { bcd, 2, gr } );
                 if (opt::read_type != "any")
                 {
                     break;
@@ -100,6 +107,7 @@ get_preferred_basecall_groups(const fast5::File& f)
             }
         }
     }
+
     // check 1d
     for (unsigned st = 0; st < 2; ++st)
     {
@@ -131,7 +139,8 @@ get_preferred_basecall_groups(const fast5::File& f)
                 if (f.have_basecall_fastq(st, gr)
                     and f.have_basecall_events(st, gr))
                 {
-                    res.push_back(std::make_pair(st, gr));
+                    res.push_back( {bcd, st, gr} );
+
                     if (opt::read_type != "any")
                     {
                         break;
@@ -140,11 +149,12 @@ get_preferred_basecall_groups(const fast5::File& f)
             }
         }
     }
+
     LOG(debug)
         << "preferred_groups: "
         << alg::os_join(res, ", ", [] (const decltype(res)::value_type& p) {
                 std::ostringstream oss;
-                oss << "(" << p.first << ":" << p.second << ")";
+                oss << "(" << p.strand_idx << ":" << p.basecall_group << ")";
                 return oss.str();
             })
         << "\n";
@@ -175,28 +185,39 @@ void process_file(const std::string& fn)
                 LOG(info) << "file [" << fn << "]: no basecalling data 
suitable for nanoplish\n";
                 return;
             }
+
             ++opt::total_files_used_count;
             const auto& p = l.front();
             // get and parse fastq
-            auto fq = f.get_basecall_fastq(p.first, p.second);
+            auto fq = f.get_basecall_fastq(p.strand_idx, p.basecall_group);
             auto fq_a = f.split_fq(fq);
+                
+            SemVer basecaller_version = parse_semver_string(p.bcd.version);
+            bool is_albacore_2_read = p.bcd.name == "albacore" && 
basecaller_version.major == 2;
+
             // construct name
             std::string name;
             std::istringstream(fq_a[0]) >> name;
-            std::replace(name.begin(), name.end(), ':', '_');
-            name += ":" + p.second + ":";
-            if (p.first == 0)
-            {
-                name += "template";
-            }
-            else if (p.first == 1)
-            {
-                name += "complement";
-            }
-            else
-            {
-                name += "2d";
-            }
+            
+            // For older basecalled data append the group information to the read name
+            // For albacore 2 data this isn't needed as nanopolish works 
directly from the raw
+            if(!is_albacore_2_read) {
+                std::replace(name.begin(), name.end(), ':', '_');
+                name += ":" + p.basecall_group + ":";
+                if (p.strand_idx == 0)
+                {
+                    name += "template";
+                }
+                else if (p.strand_idx == 1)
+                {
+                    name += "complement";
+                }
+                else
+                {
+                    name += "2d";
+                }
+            } 
+
             if (not opt::fastq)
             {
                 (*os_p)
@@ -388,12 +409,15 @@ int extract_main(int argc, char** argv)
     }
 
     // Build the ReadDB from the output file
-    ReadDB read_db;
-    read_db.build(opt::output_file);
-    read_db.save();
 
     std::clog << "[extract] found " << opt::total_files_count
               << " files, extracted " << opt::total_files_used_count
               << " reads\n";
+    
+    if(opt::total_files_used_count > 0) {
+        ReadDB read_db;
+        read_db.build(opt::output_file);
+        read_db.save();
+    } 
     return 0;
 }
diff --git a/src/nanopolish_index.cpp b/src/nanopolish_index.cpp
index 6ddc7ec..fe91dbd 100644
--- a/src/nanopolish_index.cpp
+++ b/src/nanopolish_index.cpp
@@ -54,11 +54,17 @@ static std::ostream* os_p;
 void index_file(ReadDB& read_db, const std::string& fn)
 {
     PROFILE_FUNC("index_file")
-    fast5::File* fp = new fast5::File(fn);
-    if(fp->is_open()) {
-        fast5::Raw_Samples_Params params = fp->get_raw_samples_params();
-        std::string read_id = params.read_id;
-        read_db.add_signal_path(read_id, fn);
+
+    fast5::File* fp = NULL;
+    try {
+        fp = new fast5::File(fn);
+        if(fp->is_open()) {
+            fast5::Raw_Samples_Params params = fp->get_raw_samples_params();
+            std::string read_id = params.read_id;
+            read_db.add_signal_path(read_id, fn);
+        }
+    } catch(hdf5_tools::Exception e) {
+        fprintf(stderr, "skipping invalid fast5 file: %s\n", fn.c_str());
     }
     delete fp;
 } // process_file
@@ -78,7 +84,7 @@ void index_path(ReadDB& read_db, const std::string& path)
                 // recurse
                 index_path(read_db, full_fn);
                 read_db.print_stats();
-            } else if (full_fn.find(".fast5") != -1 && 
fast5::File::is_valid_file(full_fn)) {
+            } else if (full_fn.find(".fast5") != -1) {
                 index_file(read_db, full_fn);
             }
         }
diff --git a/src/nanopolish_raw_loader.cpp b/src/nanopolish_raw_loader.cpp
index 9ed0076..0461a55 100644
--- a/src/nanopolish_raw_loader.cpp
+++ b/src/nanopolish_raw_loader.cpp
@@ -84,6 +84,7 @@ std::vector<AlignedPair> 
adaptive_banded_simple_event_align(SquiggleRead& read,
  
     // qc
     double min_average_log_emission = -5.0;
+    int max_gap_threshold = 50;
 
     // banding
     int bandwidth = 100;
@@ -315,6 +316,9 @@ std::vector<AlignedPair> 
adaptive_banded_simple_event_align(SquiggleRead& read,
 #ifdef DEBUG_ADAPTIVE
     fprintf(stderr, "[adaback] ei: %d ki: %d s: %.2f\n", curr_event_idx, 
curr_kmer_idx, max_score);
 #endif
+
+    int curr_gap = 0;
+    int max_gap = 0;
     while(curr_kmer_idx >= 0 && curr_event_idx >= 0) {
         
         // emit alignment
@@ -335,10 +339,14 @@ std::vector<AlignedPair> 
adaptive_banded_simple_event_align(SquiggleRead& read,
         if(from == FROM_D) {
             curr_kmer_idx -= 1;
             curr_event_idx -= 1;
+            curr_gap = 0;
         } else if(from == FROM_U) {
             curr_event_idx -= 1;
+            curr_gap = 0;
         } else {
             curr_kmer_idx -= 1;
+            curr_gap += 1;
+            max_gap = std::max(curr_gap, max_gap);
         }   
     }
     std::reverse(out.begin(), out.end());
@@ -348,12 +356,12 @@ std::vector<AlignedPair> 
adaptive_banded_simple_event_align(SquiggleRead& read,
     bool spanned = out.front().ref_pos == 0 && out.back().ref_pos == n_kmers - 
1;
     
     bool failed = false;
-    if(avg_log_emission < min_average_log_emission || !spanned) {
+    if(avg_log_emission < min_average_log_emission || !spanned || max_gap > 
max_gap_threshold) {
         failed = true;
         out.clear();
     }
 
-    //fprintf(stderr, "ada\t%s\t%s\t%.2lf\t%zu\t%.2lf\t%d\t%d\n", 
read.read_name.substr(0, 6).c_str(), failed ? "FAILED" : "OK", events_per_kmer, 
sequence.size(), avg_log_emission, curr_event_idx, fills);
+    //fprintf(stderr, "ada\t%s\t%s\t%.2lf\t%zu\t%.2lf\t%d\t%d\t%d\n", 
read.read_name.substr(0, 6).c_str(), failed ? "FAILED" : "OK", events_per_kmer, 
sequence.size(), avg_log_emission, curr_event_idx, max_gap, fills);
     return out;
 }
 
diff --git a/src/nanopolish_read_db.cpp b/src/nanopolish_read_db.cpp
index a064aee..d085b4d 100644
--- a/src/nanopolish_read_db.cpp
+++ b/src/nanopolish_read_db.cpp
@@ -112,11 +112,22 @@ void ReadDB::import_reads(const std::string& 
input_filename, const std::string&
 
     // Open writers
     FILE* write_fp = fopen(out_fasta_filename.c_str(), "w");
+    if(write_fp == NULL) {
+        fprintf(stderr, "error: could not open %s for write\n", 
out_fasta_filename.c_str());
+        exit(EXIT_FAILURE);
+    }
+
     BGZF* bgzf_write_fp = bgzf_dopen(fileno(write_fp), "w");
+    if(bgzf_write_fp == NULL) {
+        fprintf(stderr, "error: could not open %s for bgzipped write\n", 
out_fasta_filename.c_str());
+        exit(EXIT_FAILURE);
+    }
 
     // read input sequences, add to DB and convert to fasta
+    int ret = 0;
     kseq_t* seq = kseq_init(gz_read_fp);
-    while(kseq_read(seq) >= 0) {
+    while((ret = kseq_read(seq)) >= 0) {
+
         // Check for a path to the fast5 file in the comment of the read
         std::string path = "";
         if(seq->comment.l > 0) {
@@ -157,6 +168,12 @@ void ReadDB::import_reads(const std::string& 
input_filename, const std::string&
         }
     }
 
+    // check for abnormal exit conditions
+    if(ret <= -2) {
+        fprintf(stderr, "kseq_read returned %d indicating an error with the 
input file %s\n", ret, input_filename.c_str());
+        exit(EXIT_FAILURE);
+    }
+
     // cleanup
     kseq_destroy(seq);
     
diff --git a/src/nanopolish_squiggle_read.cpp b/src/nanopolish_squiggle_read.cpp
index 9378466..f09cc1d 100644
--- a/src/nanopolish_squiggle_read.cpp
+++ b/src/nanopolish_squiggle_read.cpp
@@ -31,6 +31,7 @@ int g_unparseable_reads = 0;
 int g_qc_fail_reads = 0;
 int g_failed_calibration_reads = 0;
 int g_failed_alignment_reads = 0;
+int g_bad_fast5_file = 0;
 
 const double MIN_CALIBRATION_VAR = 2.5;
 
@@ -73,28 +74,39 @@ SquiggleRead::SquiggleRead(const std::string& name, const ReadDB& read_db, const
     this->events_per_base[0] = events_per_base[1] = 0.0f;
     this->base_model[0] = this->base_model[1] = NULL;
     this->fast5_path = read_db.get_signal_path(this->read_name);
+    if(this->fast5_path == "") {
+        return;
+    }
 
     #pragma omp critical(sr_load_fast5)
     {
-        this->f_p = new fast5::File(fast5_path);
-        assert(this->f_p->is_open());
-
-        // Try to detect whether this read is DNA or RNA
-        this->nucleotide_type = SRNT_DNA;
-        if(this->f_p->have_context_tags_params()) {
-            fast5::Context_Tags_Params context_tags = this->f_p->get_context_tags_params();
-            std::string experiment_type = context_tags["experiment_type"];
-            if(experiment_type == "rna") {
-                this->nucleotide_type = SRNT_RNA;
+        // If for some reason the fast5s become unreadable in between indexing
+        // and executing the fast5::File constructor may throw. For example
+        // see issue #273. We catch here and skip the read in such cases
+        try {
+            this->f_p = new fast5::File(fast5_path);
+            assert(this->f_p->is_open());
+
+            // Try to detect whether this read is DNA or RNA
+            this->nucleotide_type = SRNT_DNA;
+            if(this->f_p->have_context_tags_params()) {
+                fast5::Context_Tags_Params context_tags = this->f_p->get_context_tags_params();
+                std::string experiment_type = context_tags["experiment_type"];
+                if(experiment_type == "rna") {
+                    this->nucleotide_type = SRNT_RNA;
+                }
             }
-        }
 
-        bool is_event_read = is_extract_read_name(this->read_name);
-        if(this->nucleotide_type == SRNT_DNA && is_event_read) {
-            load_from_events(flags);
-        } else {
-            this->read_sequence = read_db.get_read_sequence(read_name);
-            load_from_raw(flags);
+            bool is_event_read = is_extract_read_name(this->read_name);
+            if(this->nucleotide_type == SRNT_DNA && is_event_read) {
+                load_from_events(flags);
+            } else {
+                this->read_sequence = read_db.get_read_sequence(read_name);
+                load_from_raw(flags);
+            }
+        } catch(hdf5_tools::Exception e) {
+            fprintf(stderr, "[warning] fast5 file is unreadable and will be skipped: %s\n", fast5_path.c_str());
+            g_bad_fast5_file += 1;
         }
 
         delete this->f_p;

-- 
Alioth's /usr/local/bin/git-commit-notice on 
/srv/git.debian.org/git/debian-med/nanopolish.git

_______________________________________________
debian-med-commit mailing list
debian-med-commit@lists.alioth.debian.org
http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit

Reply via email to