Index: ruby/test/unit/largefile/tc_sample_large.rb
===================================================================
--- ruby/test/unit/largefile/tc_sample_large.rb	(revision 0)
+++ ruby/test/unit/largefile/tc_sample_large.rb	(revision 0)
@@ -0,0 +1,64 @@
+require File.dirname(__FILE__) + "/../../test_helper"
+
+# Large-file regression test: builds an on-disk Ferret index of RECORDS
+# documents whose content is RECORD_SIZE repetitions of a short token, then
+# checks that the index can still be sized, updated by key and read back.
+# The index is expensive to build, so it is kept in INDEX_DIR between runs and
+# only rebuilt when it is empty or when RELOAD_LARGE_INDEX is set.
+class SampleLargeTest < Test::Unit::TestCase
+  include Ferret::Index
+  include Ferret::Search
+  include Ferret::Store
+  include Ferret::Utils
+
+  INDEX_DIR = File.dirname(__FILE__) + "/../../temp/largefile"
+  RECORDS = 750
+  # Repetitions of the token per record. Kept as an Integer because it is used
+  # as a String#* repeat count (the previous 10e5 literal was a Float).
+  RECORD_SIZE = 1_000_000
+
+  # Opens (creating if needed) the index and populates it when required.
+  def setup
+    @index = Index.new(:path => INDEX_DIR, :create_if_missing => true, :key => :id)
+    create_index! if @index.size == 0 || ENV["RELOAD_LARGE_INDEX"]
+  end
+
+  # Releases the index opened by setup so handles do not pile up across tests.
+  def teardown
+    @index.close if @index
+  end
+
+  def test_file_index_created
+    assert @index.size == RECORDS, "Index size should be #{RECORDS}, is #{@index.size}"
+  end
+
+  # Re-adding a document with an existing :id must replace it, not append.
+  def test_keys_work
+    @index << {:content => "foo", :id => RECORDS - 4}
+    assert @index.size == RECORDS, "Index size should be #{RECORDS}, is #{@index.size}"
+  end
+
+  def test_read_file_after_two_gigs
+    assert @index.reader[RECORDS - 5].load.is_a?(Hash)
+  end
+
+  # Fills the index with RECORDS documents and optimizes it. Runs at most once
+  # per process, tracked by a class variable, because setup is invoked before
+  # every test.
+  def create_index!
+    @@already_built_large_index ||= false
+    return if @@already_built_large_index
+    @@already_built_large_index = true
+    a = "a"
+    RECORDS.times { |i|
+      # succ! advances the token in place so each record gets different content.
+      seq = (a.succ! + " ") * RECORD_SIZE
+      record = {:id => i, :content => seq}
+      @index << record
+      print "i"
+      STDOUT.flush
+    }
+    puts "o"
+    @index.optimize
+  end
+end
Index: ruby/test/unit/ts_largefile.rb
===================================================================
--- ruby/test/unit/ts_largefile.rb	(revision 0)
+++ ruby/test/unit/ts_largefile.rb	(revision 0)
@@ -0,0 +1,2 @@
+require File.join(File.dirname(__FILE__), "../test_helper.rb")
+load_test_dir('unit/largefile')
Index: c/include/index.h
===================================================================
--- c/include/index.h	(revision 770)
+++ c/include/index.h	(working copy)
@@ -456,11 +456,11 @@
 
 typedef struct Offset
 {
-    int start;
-    int end;
+    off_t start;
+    off_t end;
 } Offset;
 
-extern Offset *offset_new(int start, int end);
+extern Offset *offset_new(off_t start, off_t end);
 
 /****************************************************************************
  *
@@ -617,7 +617,7 @@
 /* * * LazyDocField * * */
 typedef struct LazyDocFieldData
 {
-    int   start;
+    off_t   start;
     int   length;
     char *text;
 } LazyDocFieldData;
Index: c/src/index.c
===================================================================
--- c/src/index.c	(revision 770)
+++ c/src/index.c	(working copy)
@@ -1203,7 +1203,8 @@
         if (NULL == text) {
             const int read_len = self->data[i].length + 1;
             self->data[i].text = text = ALLOC_N(char, read_len);
-            is_seek(self->doc->fields_in, self->data[i].start);
+            //KYLE_DEBUG
+is_seek(self->doc->fields_in, self->data[i].start);
             is_read_bytes(self->doc->fields_in, (uchar *)text, read_len);
             text[read_len - 1] = '\0';
         }
@@ -1225,7 +1226,8 @@
         RAISE(IO_ERROR, "Tried to read past end of field. Field is only %d "
               "bytes long but tried to read to %d", self->len, start + len);
     }
-    is_seek(self->doc->fields_in, self->data[0].start + start);
+    //KYLE_DEBUG
+is_seek(self->doc->fields_in, self->data[0].start + start);
     is_read_bytes(self->doc->fields_in, (uchar *)buf, len);
 }
 
@@ -1330,9 +1332,11 @@
     InStream *fdx_in = fr->fdx_in;
     InStream *fdt_in = fr->fdt_in;
 
-    is_seek(fdx_in, doc_num * FIELDS_IDX_PTR_SIZE);
+    //KYLE_DEBUG
+is_seek(fdx_in, doc_num * FIELDS_IDX_PTR_SIZE);
     pos = (off_t)is_read_u64(fdx_in);
-    is_seek(fdt_in, pos);
+    //KYLE_DEBUG
+is_seek(fdt_in, pos);
     stored_cnt = is_read_vint(fdt_in);
 
     for (i = 0; i < stored_cnt; i++) {
@@ -1368,14 +1372,16 @@
     InStream *fdx_in = fr->fdx_in;
     InStream *fdt_in = fr->fdt_in;
 
-    is_seek(fdx_in, doc_num * FIELDS_IDX_PTR_SIZE);
+    //KYLE_DEBUG
+is_seek(fdx_in, doc_num * FIELDS_IDX_PTR_SIZE);
     pos = (off_t)is_read_u64(fdx_in);
-    is_seek(fdt_in, pos);
+    //KYLE_DEBUG
+is_seek(fdt_in, pos);
     stored_cnt = is_read_vint(fdt_in);
     lazy_doc = lazy_doc_new(stored_cnt, fdt_in);
 
     for (i = 0; i < stored_cnt; i++) {
-        int start = 0, end, data_cnt;
+        off_t start = 0, end, data_cnt;
         field_num = is_read_vint(fdt_in);
         fi = fr->fis->fields[field_num];
         data_cnt = is_read_vint(fdt_in);
@@ -1396,7 +1402,8 @@
         }
 
         lazy_doc_add_field(lazy_doc, lazy_df, i);
-        is_seek(fdt_in, end);
+        //KYLE_DEBUG
+is_seek(fdt_in, end);
     }
 
     return lazy_doc;
@@ -1449,10 +1456,10 @@
         if (store_offsets) {
             int num_positions = tv->offset_cnt = is_read_vint(fdt_in);
             Offset *offsets = tv->offsets = ALLOC_N(Offset, num_positions);
-            int offset = 0;
+            off_t offset = 0;
             for (i = 0; i < num_positions; i++) {
-                offsets[i].start = offset += is_read_vint(fdt_in);
-                offsets[i].end = offset += is_read_vint(fdt_in);
+                offsets[i].start = offset += (off_t) is_read_vint(fdt_in);
+                offsets[i].end = offset += (off_t) is_read_vint(fdt_in);
             }
         }
     }
@@ -1470,13 +1477,15 @@
     int *field_nums;
 
     if (doc_num >= 0 && doc_num < fr->size) {
-        is_seek(fdx_in, FIELDS_IDX_PTR_SIZE * doc_num);
+        //KYLE_DEBUG
+is_seek(fdx_in, FIELDS_IDX_PTR_SIZE * doc_num);
 
         data_ptr = (off_t)is_read_u64(fdx_in);
         field_index_ptr = data_ptr += (off_t)is_read_u32(fdx_in);
 
         /* scan fields to get position of field_num's term vector */
-        is_seek(fdt_in, field_index_ptr);
+        //KYLE_DEBUG
+is_seek(fdt_in, field_index_ptr);
 
         field_cnt = is_read_vint(fdt_in);
         field_nums = ALLOC_N(int, field_cnt);
@@ -1487,7 +1496,8 @@
             tv_size = is_read_vint(fdt_in);
             data_ptr -= tv_size;
         }
-        is_seek(fdt_in, data_ptr);
+        //KYLE_DEBUG
+is_seek(fdt_in, data_ptr);
 
         for (i = 0; i < field_cnt; i++) {
             TermVector *tv = fr_read_term_vector(fr, field_nums[i]);
@@ -1510,13 +1520,15 @@
         InStream *fdx_in = fr->fdx_in;
         InStream *fdt_in = fr->fdt_in;
 
-        is_seek(fdx_in, FIELDS_IDX_PTR_SIZE * doc_num);
+        //KYLE_DEBUG
+is_seek(fdx_in, FIELDS_IDX_PTR_SIZE * doc_num);
 
         field_index_ptr =  (off_t)is_read_u64(fdx_in);
         field_index_ptr += (off_t)is_read_u32(fdx_in);
 
         /* scan fields to get position of field_num's term vector */
-        is_seek(fdt_in, field_index_ptr);
+        //KYLE_DEBUG
+is_seek(fdt_in, field_index_ptr);
 
         field_cnt = is_read_vint(fdt_in);
         for (i = field_cnt - 1; i >= 0 && fnum != field_num; i--) {
@@ -1526,7 +1538,8 @@
 
         if (fnum == field_num) {
             /* field was found */
-            is_seek(fdt_in, field_index_ptr - (off_t)offset);
+            //KYLE_DEBUG
+is_seek(fdt_in, field_index_ptr - (off_t)offset);
             tv = fr_read_term_vector(fr, field_num);
         }
     }
@@ -1683,8 +1696,8 @@
         int last_end = 0;
         os_write_vint(fdt_out, offset_count);  /* write shared prefix length */
         for (i = 0; i < offset_count; i++) {
-            int start = offsets[i].start;
-            int end = offsets[i].end;
+            off_t start = offsets[i].start;
+            off_t end = offsets[i].end;
             os_write_vint(fdt_out, start - last_end);
             os_write_vint(fdt_out, end - start);
             last_end = end;
@@ -1759,7 +1772,8 @@
         int index_size = sti->index_size;
         off_t index_ptr = 0;
         ste_reset(index_te);
-        is_seek(STE(index_te)->is, sti->index_ptr);
+        //KYLE_DEBUG
+is_seek(STE(index_te)->is, sti->index_ptr);
         STE(index_te)->size = sti->index_size;
         
         sti->index_terms = ALLOC_N(char *, index_size);
@@ -1914,7 +1928,8 @@
     te->field_num = field_num;
     if (sti) {
         STE(te)->size = sti->size;
-        is_seek(STE(te)->is, sti->ptr);
+        //KYLE_DEBUG
+is_seek(STE(te)->is, sti->ptr);
     }
     else {
         STE(te)->size = 0;
@@ -1925,7 +1940,8 @@
 static void ste_index_seek(TermEnum *te, SegmentTermIndex *sti, int idx_offset)
 {
     int term_len = sti->index_term_lens[idx_offset];
-    is_seek(STE(te)->is, sti->index_ptrs[idx_offset]);
+    //KYLE_DEBUG
+is_seek(STE(te)->is, sti->index_ptrs[idx_offset]);
     STE(te)->pos = STE(te)->sfi->index_interval * idx_offset - 1;
     memcpy(te->curr_term,
            sti->index_terms[idx_offset],
@@ -2570,7 +2586,8 @@
         stde->frq_ptr = ti->frq_ptr;
         stde->prx_ptr = ti->prx_ptr;
         stde->skip_ptr = ti->frq_ptr + ti->skip_offset;
-        is_seek(stde->frq_in, ti->frq_ptr);
+        //KYLE_DEBUG
+is_seek(stde->frq_in, ti->frq_ptr);
         stde->have_skipped = false;
     }
 }
@@ -2680,7 +2697,8 @@
 
         //printf("skip_ptr = %lld\n", stde->skip_ptr);
         if (!stde->have_skipped) {                 /* lazily seek skip stream */
-            is_seek(stde->skip_in, stde->skip_ptr);
+            //KYLE_DEBUG
+is_seek(stde->skip_in, stde->skip_ptr);
             stde->have_skipped = true;
         }
 
@@ -2714,7 +2732,8 @@
 
         /* if we found something to skip, skip it */
         if (last_frq_ptr > is_pos(stde->frq_in)) {
-            is_seek(stde->frq_in, last_frq_ptr);
+            //KYLE_DEBUG
+is_seek(stde->frq_in, last_frq_ptr);
             stde->seek_prox(stde, last_prx_ptr);
 
             stde->doc_num = last_skip_doc;
@@ -2797,7 +2816,8 @@
     }
     else {
         stde_seek_ti(stde, ti);
-        is_seek(stde->prx_in, ti->prx_ptr);
+        //KYLE_DEBUG
+is_seek(stde->prx_in, ti->prx_ptr);
     }
 }
 
@@ -2855,7 +2875,8 @@
 
 static void stpe_seek_prox(SegmentTermDocEnum *stde, off_t prx_ptr)
 {
-    is_seek(stde->prx_in, prx_ptr);
+    //KYLE_DEBUG
+is_seek(stde->prx_in, prx_ptr);
     stde->prx_cnt = 0;
 }
 
@@ -3949,7 +3970,8 @@
     else {
         InStream *norm_in = is_clone(norm->is);
         /* read from disk */
-        is_seek(norm_in, 0);
+        //KYLE_DEBUG
+is_seek(norm_in, 0);
         is_read_bytes(norm_in, buf, SR_SIZE(sr));
         is_close(norm_in);
     }
@@ -4799,7 +4821,7 @@
  *
  ****************************************************************************/
 
-Offset *offset_new(int start, int end)
+Offset *offset_new(off_t start, off_t end)
 {
     Offset *offset = ALLOC(Offset);
     offset->start = start;
@@ -5204,7 +5226,7 @@
     int doc_num = dw->doc_num;
     int i;
     const int df_size = df->size;
-    int start_offset = 0;
+    off_t start_offset = 0;
 
     if (fld_inv->is_tokenized) {
         Token *tk;
@@ -5533,7 +5555,8 @@
             if (!smi->deleted_docs || !bv_get(smi->deleted_docs, j)) {
                 os_write_u64(fdx_out, os_pos(fdt_out));
                 os_write_u32(fdx_out, tv_idx_offset);
-                is_seek(fdt_in, start);
+                //KYLE_DEBUG
+is_seek(fdt_in, start);
                 is2os_copy_bytes(fdt_in, fdt_out, end - start);
             }
         }
Index: c/src/term_vectors.c
===================================================================
--- c/src/term_vectors.c	(revision 770)
+++ c/src/term_vectors.c	(working copy)
@@ -174,13 +174,15 @@
     int *field_nums;
 
     if (doc_num >= 0 && doc_num < tvr->size) {
-        is_seek(tvx_in, 12 * doc_num);
+        //KYLE_DEBUG
+is_seek(tvx_in, 12 * doc_num);
 
         data_ptr = (off_t)is_read_u64(tvx_in);
         field_index_ptr = data_ptr + (off_t)is_read_u32(tvx_in);
 
         /* scan fields to get position of field_num's term vector */
-        is_seek(tvd_in, field_index_ptr);
+        //KYLE_DEBUG
+is_seek(tvd_in, field_index_ptr);
 
         field_cnt = is_read_vint(tvd_in);
         field_nums = ALLOC_N(int, field_cnt);
@@ -189,7 +191,8 @@
             field_nums[i] = is_read_vint(tvd_in);
             is_read_vint(tvd_in); /* skip space, we don't need it */
         }
-        is_seek(tvd_in, data_ptr);
+        //KYLE_DEBUG
+is_seek(tvd_in, data_ptr);
 
         for (i = 0; i < field_cnt; i++) {
             TermVector *tv = tvr_read_term_vector(tvr, field_nums[i]);
@@ -213,13 +216,15 @@
     TermVector *tv = NULL;
 
     if (doc_num >= 0 && doc_num < tvr->size) {
-        is_seek(tvx_in, 12 * doc_num);
+        //KYLE_DEBUG
+is_seek(tvx_in, 12 * doc_num);
 
         data_ptr = (off_t)is_read_u64(tvx_in);
         field_index_ptr = data_ptr + (off_t)is_read_u32(tvx_in);
 
         /* scan fields to get position of field_num's term vector */
-        is_seek(tvd_in, field_index_ptr);
+        //KYLE_DEBUG
+is_seek(tvd_in, field_index_ptr);
 
         field_cnt = is_read_vint(tvd_in);
         for (i = 0; i < field_cnt; i++) {
@@ -230,7 +235,8 @@
         }
         if (i < field_cnt) {
             /* field was found */
-            is_seek(tvd_in, data_ptr + offset);
+            //KYLE_DEBUG
+is_seek(tvd_in, data_ptr + offset);
             tv = tvr_read_term_vector(tvr, field_num);
         }
     }
