Interesting. The Cells differ in sequence id, which would seem to imply a race around obtaining the sequence id when writing the WAL. Are the crashes happening when a regionserver replays WAL files while recovering a crashed server? Or when are they happening? If the former, you might have to find the offending WAL and move it out of the line of action. You will then need to replay that WAL subsequently, but without the bad Cell. S
On Thu, Aug 15, 2019 at 8:57 AM Alexander Batyrshin <0x62...@gmail.com> wrote: > > Hello all, > > We observer error "Added a key not lexically larger than previous” that > cause most of our region-servers to crash in our cluster. > HBase-1.4.10 > > 2019-08-15 18:02:10,554 INFO [MemStoreFlusher.0] regionserver.HRegion: > Flushing 1/1 column families, memstore=56.08 MB > 2019-08-15 18:02:10,727 WARN [MemStoreFlusher.0] regionserver.HStore: > Failed flushing store file, retrying num=0 > java.io.IOException: Added a key not lexically larger than previous. > Current cell = > \x0901820448218>wGavb'/d:elr/1565881054828/DeleteColumn/vlen=0/seqid=44456567, > lastCell = > \x0901820448218>wGavb'/d:elr/1565881054828/Put/vlen=1/seqid=44457770 > at org.apache.hadoop.hbase.io > .hfile.AbstractHFileWriter.checkKey(AbstractHFileWriter.java:204) > at org.apache.hadoop.hbase.io > .hfile.HFileWriterV2.append(HFileWriterV2.java:279) > at org.apache.hadoop.hbase.io > .hfile.HFileWriterV3.append(HFileWriterV3.java:87) > at > org.apache.hadoop.hbase.regionserver.StoreFile$Writer.append(StoreFile.java:1127) > at > org.apache.hadoop.hbase.regionserver.StoreFlusher.performFlush(StoreFlusher.java:139) > at > org.apache.hadoop.hbase.regionserver.DefaultStoreFlusher.flushSnapshot(DefaultStoreFlusher.java:75) > at > org.apache.hadoop.hbase.regionserver.HStore.flushCache(HStore.java:1003) > at > org.apache.hadoop.hbase.regionserver.HStore$StoreFlusherImpl.flushCache(HStore.java:2523) > at > org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2622) > at > org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2352) > at > org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2314) > at > org.apache.hadoop.hbase.regionserver.HRegion.flushcache(HRegion.java:2200) > at > org.apache.hadoop.hbase.regionserver.HRegion.flush(HRegion.java:2125) > at > 
org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:512) > at > org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:482) > at > org.apache.hadoop.hbase.regionserver.MemStoreFlusher.access$900(MemStoreFlusher.java:76) > at > org.apache.hadoop.hbase.regionserver.MemStoreFlusher$FlushHandler.run(MemStoreFlusher.java:264) > at java.lang.Thread.run(Thread.java:748) > > 2019-08-15 18:02:21,776 WARN [MemStoreFlusher.0] regionserver.HStore: > Failed flushing store file, retrying num=9 > java.io.IOException: Added a key not lexically larger than previous. > Current cell = > \x0901820448218>wGavb'/d:elr/1565881054828/DeleteColumn/vlen=0/seqid=44456567, > lastCell = > \x0901820448218>wGavb'/d:elr/1565881054828/Put/vlen=1/seqid=44457770 > at org.apache.hadoop.hbase.io > .hfile.AbstractHFileWriter.checkKey(AbstractHFileWriter.java:204) > at org.apache.hadoop.hbase.io > .hfile.HFileWriterV2.append(HFileWriterV2.java:279) > at org.apache.hadoop.hbase.io > .hfile.HFileWriterV3.append(HFileWriterV3.java:87) > at > org.apache.hadoop.hbase.regionserver.StoreFile$Writer.append(StoreFile.java:1127) > at > org.apache.hadoop.hbase.regionserver.StoreFlusher.performFlush(StoreFlusher.java:139) > at > org.apache.hadoop.hbase.regionserver.DefaultStoreFlusher.flushSnapshot(DefaultStoreFlusher.java:75) > at > org.apache.hadoop.hbase.regionserver.HStore.flushCache(HStore.java:1003) > at > org.apache.hadoop.hbase.regionserver.HStore$StoreFlusherImpl.flushCache(HStore.java:2523) > at > org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2622) > at > org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2352) > at > org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2314) > at > org.apache.hadoop.hbase.regionserver.HRegion.flushcache(HRegion.java:2200) > at > org.apache.hadoop.hbase.regionserver.HRegion.flush(HRegion.java:2125) > at > 
org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:512) > at > org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:482) > at > org.apache.hadoop.hbase.regionserver.MemStoreFlusher.access$900(MemStoreFlusher.java:76) > at > org.apache.hadoop.hbase.regionserver.MemStoreFlusher$FlushHandler.run(MemStoreFlusher.java:264) > at java.lang.Thread.run(Thread.java:748) > > 2019-08-15 18:02:21,777 FATAL [MemStoreFlusher.0] > regionserver.HRegionServer: ABORTING region server > prod006,60020,1565873610692: Replay of WAL required. Forcing server shutdown > org.apache.hadoop.hbase.DroppedSnapshotException: region: > TBL_TABLE_CODE,\x0904606203097821slG=sPD,1563070299676.5110b3395ca64a51cea99c6572a4c3d9. > at > org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2675) > at > org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2352) > at > org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2314) > at > org.apache.hadoop.hbase.regionserver.HRegion.flushcache(HRegion.java:2200) > at > org.apache.hadoop.hbase.regionserver.HRegion.flush(HRegion.java:2125) > at > org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:512) > at > org.apache.hadoop.hbase.regionserver.MemStoreFlusher.flushRegion(MemStoreFlusher.java:482) > at > org.apache.hadoop.hbase.regionserver.MemStoreFlusher.access$900(MemStoreFlusher.java:76) > at > org.apache.hadoop.hbase.regionserver.MemStoreFlusher$FlushHandler.run(MemStoreFlusher.java:264) > at java.lang.Thread.run(Thread.java:748) > Caused by: java.io.IOException: Added a key not lexically larger than > previous. 
Current cell = > \x0901820448218>wGavb'/d:elr/1565881054828/DeleteColumn/vlen=0/seqid=44456567, > lastCell = > \x0901820448218>wGavb'/d:elr/1565881054828/Put/vlen=1/seqid=44457770 > at org.apache.hadoop.hbase.io > .hfile.AbstractHFileWriter.checkKey(AbstractHFileWriter.java:204) > at org.apache.hadoop.hbase.io > .hfile.HFileWriterV2.append(HFileWriterV2.java:279) > at org.apache.hadoop.hbase.io > .hfile.HFileWriterV3.append(HFileWriterV3.java:87) > at > org.apache.hadoop.hbase.regionserver.StoreFile$Writer.append(StoreFile.java:1127) > at > org.apache.hadoop.hbase.regionserver.StoreFlusher.performFlush(StoreFlusher.java:139) > at > org.apache.hadoop.hbase.regionserver.DefaultStoreFlusher.flushSnapshot(DefaultStoreFlusher.java:75) > at > org.apache.hadoop.hbase.regionserver.HStore.flushCache(HStore.java:1003) > at > org.apache.hadoop.hbase.regionserver.HStore$StoreFlusherImpl.flushCache(HStore.java:2523) > at > org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:2622) > ... 9 more > >