[tesseract-ocr] Re: Unrecognized lines using psm 3
I found the function that puts everything on the table, with regard to the scrollview blob debug window... ccstruct/blobbox.cpp: ScrollView::Color BLOBNBOX::TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type) { switch (region_type) { case BRT_HLINE: return ScrollView::BROWN; case BRT_VLINE: return ScrollView::DARK_GREEN; case BRT_RECTIMAGE: return ScrollView::RED; case BRT_POLYIMAGE: return ScrollView::ORANGE; case BRT_UNKNOWN: return flow_type == BTFT_NONTEXT ? ScrollView::CYAN : ScrollView::WHITE; case BRT_VERT_TEXT: if (flow_type == BTFT_STRONG_CHAIN || flow_type == BTFT_TEXT_ON_IMAGE) return ScrollView::GREEN; if (flow_type == BTFT_CHAIN) return ScrollView::LIME_GREEN; return ScrollView::YELLOW; case BRT_TEXT: if (flow_type == BTFT_STRONG_CHAIN) return ScrollView::BLUE; if (flow_type == BTFT_TEXT_ON_IMAGE) return ScrollView::LIGHT_BLUE; if (flow_type == BTFT_CHAIN) return ScrollView::MEDIUM_BLUE; if (flow_type == BTFT_LEADER) return ScrollView::WHEAT; if (flow_type == BTFT_NONTEXT) return ScrollView::PINK; return ScrollView::MAGENTA; default: return ScrollView::GREY; } } and some detailed description on what it all means... // The possible region types of a BLOBNBOX. // Note: keep all the text types > BRT_UNKNOWN and all the image types less. // Keep in sync with kBlobTypes in colpartition.cpp and BoxColor, and the // *Type static functions below. enum BlobRegionType { BRT_NOISE, // Neither text nor image. BRT_HLINE, // Horizontal separator line. BRT_VLINE, // Vertical separator line. BRT_RECTIMAGE, // Rectangular image. BRT_POLYIMAGE, // Non-rectangular image. BRT_UNKNOWN, // Not determined yet. BRT_VERT_TEXT, // Vertical alignment, not necessarily vertically oriented. BRT_TEXT, // Convincing text. BRT_COUNT // Number of possibilities. }; // BlobTextFlowType indicates the quality of neighbouring information // related to a chain of connected components, either horizontally or // vertically. 
Also used by ColPartition for the collection of blobs // within, which should all have the same value in most cases. enum BlobTextFlowType { BTFT_NONE, // No text flow set yet. BTFT_NONTEXT, // Flow too poor to be likely text. BTFT_NEIGHBOURS, // Neighbours support flow in this direction. BTFT_CHAIN, // There is a weak chain of text in this direction. BTFT_STRONG_CHAIN, // There is a strong chain of text in this direction. BTFT_TEXT_ON_IMAGE, // There is a strong chain of text on an image. BTFT_LEADER, // Leader dots/dashes etc. BTFT_COUNT }; So, it thinks there is an image in there somehow and all I did to fix it was to bypass an if statement. diff --git a/textord/colfind.cpp b/textord/colfind.cpp index ea5d73d..3b4246e 100644 --- a/textord/colfind.cpp +++ b/textord/colfind.cpp @@ -309,7 +309,7 @@ int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, stroke_width_->GradeBlobsIntoPartitions( pageseg_mode, rerotate_, input_block, nontext_map_, denorm_, cjk_script_, &projection_, diacritic_blobs, &part_grid_, &big_parts_); - if (!PSM_SPARSE(pageseg_mode)) { + if (!PSM_SPARSE(pageseg_mode) && 0) { ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_, input_block, this, &part_grid_, &big_parts_); ImageFind::TransferImagePartsToImageMask(rerotate_, &part_grid_, I think the `&& 0` should be replaced with a `&& init_var` and mainlined. Something like textord_imagefind. Any comments or suggestions? -- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-ocr+unsubscr...@googlegroups.com. To post to this group, send email to tesseract-ocr@googlegroups.com. Visit this group at https://groups.google.com/group/tesseract-ocr. To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/264c9e41-47ce-41d6-ab2a-4b8162550abe%40googlegroups.com. 
For more options, visit https://groups.google.com/d/optout.
[tesseract-ocr] Re: Unrecognized lines using psm 3
I found the function that puts everything on the table, with regard to the scrollview blob debug window... ccstruct/blobbox.cpp: ScrollView::Color BLOBNBOX::TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type) { switch (region_type) { case BRT_HLINE: return ScrollView::BROWN; case BRT_VLINE: return ScrollView::DARK_GREEN; case BRT_RECTIMAGE: return ScrollView::RED; case BRT_POLYIMAGE: return ScrollView::ORANGE; case BRT_UNKNOWN: return flow_type == BTFT_NONTEXT ? ScrollView::CYAN : ScrollView::WHITE; case BRT_VERT_TEXT: if (flow_type == BTFT_STRONG_CHAIN || flow_type == BTFT_TEXT_ON_IMAGE) return ScrollView::GREEN; if (flow_type == BTFT_CHAIN) return ScrollView::LIME_GREEN; return ScrollView::YELLOW; case BRT_TEXT: if (flow_type == BTFT_STRONG_CHAIN) return ScrollView::BLUE; if (flow_type == BTFT_TEXT_ON_IMAGE) return ScrollView::LIGHT_BLUE; if (flow_type == BTFT_CHAIN) return ScrollView::MEDIUM_BLUE; if (flow_type == BTFT_LEADER) return ScrollView::WHEAT; if (flow_type == BTFT_NONTEXT) return ScrollView::PINK; return ScrollView::MAGENTA; default: return ScrollView::GREY; } } and some detailed description on what it all means... // The possible region types of a BLOBNBOX. // Note: keep all the text types > BRT_UNKNOWN and all the image types less. // Keep in sync with kBlobTypes in colpartition.cpp and BoxColor, and the // *Type static functions below. enum BlobRegionType { BRT_NOISE, // Neither text nor image. BRT_HLINE, // Horizontal separator line. BRT_VLINE, // Vertical separator line. BRT_RECTIMAGE, // Rectangular image. BRT_POLYIMAGE, // Non-rectangular image. BRT_UNKNOWN, // Not determined yet. BRT_VERT_TEXT, // Vertical alignment, not necessarily vertically oriented. BRT_TEXT, // Convincing text. BRT_COUNT // Number of possibilities. }; // BlobTextFlowType indicates the quality of neighbouring information // related to a chain of connected components, either horizontally or // vertically. 
Also used by ColPartition for the collection of blobs // within, which should all have the same value in most cases. enum BlobTextFlowType { BTFT_NONE, // No text flow set yet. BTFT_NONTEXT, // Flow too poor to be likely text. BTFT_NEIGHBOURS, // Neighbours support flow in this direction. BTFT_CHAIN, // There is a weak chain of text in this direction. BTFT_STRONG_CHAIN, // There is a strong chain of text in this direction. BTFT_TEXT_ON_IMAGE, // There is a strong chain of text on an image. BTFT_LEADER, // Leader dots/dashes etc. BTFT_COUNT }; So, it thinks there is an image in there somehow and all I did to fix it was to bypass an if statement. diff --git a/textord/colfind.cpp b/textord/colfind.cpp index ea5d73d..3b4246e 100644 --- a/textord/colfind.cpp +++ b/textord/colfind.cpp @@ -309,7 +309,7 @@ int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, stroke_width_->GradeBlobsIntoPartitions( pageseg_mode, rerotate_, input_block, nontext_map_, denorm_, cjk_script_, &projection_, diacritic_blobs, &part_grid_, &big_parts_); - if (!PSM_SPARSE(pageseg_mode)) { + if (!PSM_SPARSE(pageseg_mode) && 0) { ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_, input_block, this, &part_grid_, &big_parts_); ImageFind::TransferImagePartsToImageMask(rerotate_, &part_grid_, I think the `&& 0` should be replaced with a `&& !init_var` and mainlined. Something like textord_disable_imagefind. Any comments or suggestions? -- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-ocr+unsubscr...@googlegroups.com. To post to this group, send email to tesseract-ocr@googlegroups.com. Visit this group at https://groups.google.com/group/tesseract-ocr. To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/f6177958-4e4f-4dfd-b05f-dfc7cd479930%40googlegroups.com. 
For more options, visit https://groups.google.com/d/optout.