[tesseract-ocr] Re: Unrecognized lines using psm 3

2016-09-02 Thread fuzzy7k
I found the function that puts everything on the table, with regard to the 
scrollview blob debug window...
ccstruct/blobbox.cpp:
// Maps a (region type, text flow type) pair to a ScrollView colour.
// Lines and images get a single fixed colour; for unknown, vertical-text and
// text regions the colour additionally depends on flow_type. Any region type
// not handled explicitly yields GREY.
ScrollView::Color BLOBNBOX::TextlineColor(BlobRegionType region_type,
                                          BlobTextFlowType flow_type) {
  // Separator lines and image regions: colour is independent of flow_type.
  if (region_type == BRT_HLINE) return ScrollView::BROWN;
  if (region_type == BRT_VLINE) return ScrollView::DARK_GREEN;
  if (region_type == BRT_RECTIMAGE) return ScrollView::RED;
  if (region_type == BRT_POLYIMAGE) return ScrollView::ORANGE;

  // Undetermined region: only distinguish non-text flow from everything else.
  if (region_type == BRT_UNKNOWN) {
    if (flow_type == BTFT_NONTEXT) return ScrollView::CYAN;
    return ScrollView::WHITE;
  }

  // Vertical text: shades of green, with yellow as the fallback.
  if (region_type == BRT_VERT_TEXT) {
    switch (flow_type) {
      case BTFT_STRONG_CHAIN:
      case BTFT_TEXT_ON_IMAGE:
        return ScrollView::GREEN;
      case BTFT_CHAIN:
        return ScrollView::LIME_GREEN;
      default:
        return ScrollView::YELLOW;
    }
  }

  // Horizontal text: one colour per flow type, magenta as the fallback.
  if (region_type == BRT_TEXT) {
    switch (flow_type) {
      case BTFT_STRONG_CHAIN:
        return ScrollView::BLUE;
      case BTFT_TEXT_ON_IMAGE:
        return ScrollView::LIGHT_BLUE;
      case BTFT_CHAIN:
        return ScrollView::MEDIUM_BLUE;
      case BTFT_LEADER:
        return ScrollView::WHEAT;
      case BTFT_NONTEXT:
        return ScrollView::PINK;
      default:
        return ScrollView::MAGENTA;
    }
  }

  // Everything else (e.g. BRT_NOISE).
  return ScrollView::GREY;
}


and some detailed description on what it all means...

// The possible region types of a BLOBNBOX.
// Note: keep all the text types > BRT_UNKNOWN and all the image types less.
// Keep in sync with kBlobTypes in colpartition.cpp and BoxColor, and the
// *Type static functions below.
enum BlobRegionType {
  BRT_NOISE,      // Neither text nor image.
  BRT_HLINE,      // Horizontal separator line.
  BRT_VLINE,      // Vertical separator line.
  BRT_RECTIMAGE,  // Rectangular image.
  BRT_POLYIMAGE,  // Non-rectangular image.
  BRT_UNKNOWN,    // Not determined yet.
  BRT_VERT_TEXT,  // Vertical alignment, not necessarily vertically oriented.
  BRT_TEXT,       // Convincing text.
  BRT_COUNT       // Number of possibilities.
};

// BlobTextFlowType describes the quality of the neighbouring information
// for a chain of connected components, in either the horizontal or the
// vertical direction. ColPartition also uses it for the collection of blobs
// it contains, which should all share the same value in most cases.
enum BlobTextFlowType {
  BTFT_NONE,           // No text flow set yet.
  BTFT_NONTEXT,        // Flow too poor to be likely text.
  BTFT_NEIGHBOURS,     // Neighbours support flow in this direction.
  BTFT_CHAIN,          // There is a weak chain of text in this direction.
  BTFT_STRONG_CHAIN,   // There is a strong chain of text in this direction.
  BTFT_TEXT_ON_IMAGE,  // There is a strong chain of text on an image.
  BTFT_LEADER,         // Leader dots/dashes etc.
  BTFT_COUNT           // Number of flow types (keep last).
};


So, it thinks there is an image in there somehow and all I did to fix it 
was to bypass an if statement.

diff --git a/textord/colfind.cpp b/textord/colfind.cpp
index ea5d73d..3b4246e 100644
--- a/textord/colfind.cpp
+++ b/textord/colfind.cpp
@@ -309,7 +309,7 @@ int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, 
Pix* scaled_color,
   stroke_width_->GradeBlobsIntoPartitions(
   pageseg_mode, rerotate_, input_block, nontext_map_, denorm_, 
cjk_script_,
   &projection_, diacritic_blobs, &part_grid_, &big_parts_);
-  if (!PSM_SPARSE(pageseg_mode)) {
+  if (!PSM_SPARSE(pageseg_mode) && 0) {
 ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_,
input_block, this, &part_grid_, &
big_parts_);
 ImageFind::TransferImagePartsToImageMask(rerotate_, &part_grid_,

I think the `&& 0` should be replaced with an `&& init_var` and mainlined. 
Something like textord_imagefind. Any comments, suggestions?


-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to tesseract-ocr+unsubscr...@googlegroups.com.
To post to this group, send email to tesseract-ocr@googlegroups.com.
Visit this group at https://groups.google.com/group/tesseract-ocr.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/264c9e41-47ce-41d6-ab2a-4b8162550abe%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.


[tesseract-ocr] Re: Unrecognized lines using psm 3

2016-09-02 Thread fuzzy7k
I found the function that puts everything on the table, with regard to the 
scrollview blob debug window...
ccstruct/blobbox.cpp:
// Maps a (region type, text flow type) pair to a ScrollView colour.
// Lines and images get a single fixed colour; for unknown, vertical-text and
// text regions the colour additionally depends on flow_type. Any region type
// not handled explicitly yields GREY.
ScrollView::Color BLOBNBOX::TextlineColor(BlobRegionType region_type,
                                          BlobTextFlowType flow_type) {
  // Separator lines and image regions: colour is independent of flow_type.
  if (region_type == BRT_HLINE) return ScrollView::BROWN;
  if (region_type == BRT_VLINE) return ScrollView::DARK_GREEN;
  if (region_type == BRT_RECTIMAGE) return ScrollView::RED;
  if (region_type == BRT_POLYIMAGE) return ScrollView::ORANGE;

  // Undetermined region: only distinguish non-text flow from everything else.
  if (region_type == BRT_UNKNOWN) {
    if (flow_type == BTFT_NONTEXT) return ScrollView::CYAN;
    return ScrollView::WHITE;
  }

  // Vertical text: shades of green, with yellow as the fallback.
  if (region_type == BRT_VERT_TEXT) {
    switch (flow_type) {
      case BTFT_STRONG_CHAIN:
      case BTFT_TEXT_ON_IMAGE:
        return ScrollView::GREEN;
      case BTFT_CHAIN:
        return ScrollView::LIME_GREEN;
      default:
        return ScrollView::YELLOW;
    }
  }

  // Horizontal text: one colour per flow type, magenta as the fallback.
  if (region_type == BRT_TEXT) {
    switch (flow_type) {
      case BTFT_STRONG_CHAIN:
        return ScrollView::BLUE;
      case BTFT_TEXT_ON_IMAGE:
        return ScrollView::LIGHT_BLUE;
      case BTFT_CHAIN:
        return ScrollView::MEDIUM_BLUE;
      case BTFT_LEADER:
        return ScrollView::WHEAT;
      case BTFT_NONTEXT:
        return ScrollView::PINK;
      default:
        return ScrollView::MAGENTA;
    }
  }

  // Everything else (e.g. BRT_NOISE).
  return ScrollView::GREY;
}


and some detailed description on what it all means...

// The possible region types of a BLOBNBOX.
// Note: keep all the text types > BRT_UNKNOWN and all the image types less.
// Keep in sync with kBlobTypes in colpartition.cpp and BoxColor, and the
// *Type static functions below.
enum BlobRegionType {
  BRT_NOISE,      // Neither text nor image.
  BRT_HLINE,      // Horizontal separator line.
  BRT_VLINE,      // Vertical separator line.
  BRT_RECTIMAGE,  // Rectangular image.
  BRT_POLYIMAGE,  // Non-rectangular image.
  BRT_UNKNOWN,    // Not determined yet.
  BRT_VERT_TEXT,  // Vertical alignment, not necessarily vertically oriented.
  BRT_TEXT,       // Convincing text.
  BRT_COUNT       // Number of possibilities.
};

// BlobTextFlowType describes the quality of the neighbouring information
// for a chain of connected components, in either the horizontal or the
// vertical direction. ColPartition also uses it for the collection of blobs
// it contains, which should all share the same value in most cases.
enum BlobTextFlowType {
  BTFT_NONE,           // No text flow set yet.
  BTFT_NONTEXT,        // Flow too poor to be likely text.
  BTFT_NEIGHBOURS,     // Neighbours support flow in this direction.
  BTFT_CHAIN,          // There is a weak chain of text in this direction.
  BTFT_STRONG_CHAIN,   // There is a strong chain of text in this direction.
  BTFT_TEXT_ON_IMAGE,  // There is a strong chain of text on an image.
  BTFT_LEADER,         // Leader dots/dashes etc.
  BTFT_COUNT           // Number of flow types (keep last).
};


So, it thinks there is an image in there somehow and all I did to fix it 
was to bypass an if statement.

diff --git a/textord/colfind.cpp b/textord/colfind.cpp
index ea5d73d..3b4246e 100644
--- a/textord/colfind.cpp
+++ b/textord/colfind.cpp
@@ -309,7 +309,7 @@ int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, 
Pix* scaled_color,
   stroke_width_->GradeBlobsIntoPartitions(
   pageseg_mode, rerotate_, input_block, nontext_map_, denorm_, 
cjk_script_,
   &projection_, diacritic_blobs, &part_grid_, &big_parts_);
-  if (!PSM_SPARSE(pageseg_mode)) {
+  if (!PSM_SPARSE(pageseg_mode) && 0) {
 ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_,
input_block, this, &part_grid_, &
big_parts_);
 ImageFind::TransferImagePartsToImageMask(rerotate_, &part_grid_,

I think the `&& 0` should be replaced with an `&& !init_var` and mainlined. 
Something like textord_disable_imagefind. Any comments, suggestions?

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to tesseract-ocr+unsubscr...@googlegroups.com.
To post to this group, send email to tesseract-ocr@googlegroups.com.
Visit this group at https://groups.google.com/group/tesseract-ocr.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/f6177958-4e4f-4dfd-b05f-dfc7cd479930%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.