diff --git a/Tess4J/build.xml b/Tess4J/build.xml
deleted file mode 100644
index bd1e903..0000000
--- a/Tess4J/build.xml
+++ /dev/null
@@ -1,116 +0,0 @@
-
-
-
-
-
-
-
-
-
-
- Tess4J is a JNA wrapper for the Tesseract OCR API; it provides character recognition support for common image formats, multi-page images, and PDF documents. The library has been developed and tested on Windows and Linux.
- Tess4J is released and distributed under the Apache License, v2.0. Its official homepage is at http://tess4j.sourceforge.net.
- Java Runtime Environment, JNA, and JAI-ImageIO are required. Apache Ant and JUnit are used for program building and unit testing. The Tesseract DLLs were built with VS2015 and therefore depend on the Visual C++ 2015 Redistributable Packages.
-
- Tesseract 3.05.01 and Leptonica 1.74.4 (via Lept4J) 32- and 64-bit
- DLLs, language data for English, and sample images are bundled with the library.
- Language data packs for
- Tesseract should be decompressed and placed into the tessdata folder.
-
- The Linux shared object library (libtesseract.so) equivalent to the
- DLL is available in Tesseract 3.05.01, which can be built from source by following the instructions in the Tesseract Wiki.
-
- To unit test, at the command line, execute: -
----
-ant test
-
- Support for PDF documents is available through either GPL Ghostscript, which should be installed and included in the system path, or PDFBox, if Ghostscript is not available.
- Images to be OCRed should be scanned at a resolution of at least 200 DPI (dots per inch) and up to 400 DPI, in monochrome (black-and-white) or grayscale. Scanning at higher resolutions will not necessarily result in better recognition accuracy. The actual success rates depend greatly on the quality of the scanned image. The typical settings for scanning are 300 DPI and 1 bpp (bit per pixel) black-and-white or 8 bpp grayscale, in uncompressed TIFF or PNG format. PNG is usually smaller in size than other image formats and still keeps high quality because it uses lossless data compression; TIFF has the advantage of being able to contain multiple images (pages) in a single file.
- Several built-in functions are also provided for merging several images or PDF files into a single one for convenient OCR operations, or for splitting a PDF file into smaller ones if it is too large, which can cause out-of-memory exceptions.
-
- The following code example shows common usage of the library. Make sure the tessdata
- folder is populated with appropriate language data files and the .jar
- files are in the classpath. On Windows, the DLLs will be automatically extracted
- from tess4j.jar to the default temporary directory and loaded.
-
-package net.sourceforge.tess4j.example;
-
-import java.io.File;
-import net.sourceforge.tess4j.*;
-
-public class TesseractExample {
-    public static void main(String[] args) {
-        // ImageIO.scanForPlugins(); // for server environment
-        File imageFile = new File("eurotext.tif");
-        ITesseract instance = new Tesseract(); // JNA Interface Mapping
-        // ITesseract instance = new Tesseract1(); // JNA Direct Mapping
-        // instance.setDatapath("<parentPath>"); // replace <parentPath> with path to parent directory of tessdata
-        // instance.setLanguage("eng");
-
-        try {
-            String result = instance.doOCR(imageFile);
-            System.out.println(result);
-        } catch (TesseractException e) {
-            System.err.println(e.getMessage());
-        }
-    }
-}
-
- Please visit the website for the library's documentation.
-tessedit_ocr_engine_mode
.init_*()
, to indicate
- * that any of the above modes should be automatically inferred from the
- * variables in the language-specific config, command-line configs, or
- * if not specified in any of the above should be set to the default
- * OEM_TESSERACT_ONLY
.
- */
- public static final int OEM_DEFAULT = 3;
- };
-
- /**
-  * Page layout analysis modes. Apart from {@code PSM_OSD_ONLY}, the values
-  * must stay ordered by decreasing amount of layout analysis so that the
-  * inequality tests built on them keep working (the original comment speaks
-  * of "macros below"; presumably those of the native header it was ported
-  * from).
-  */
- public static interface TessPageSegMode {
-
-     /** Orientation and script detection only. */
-     int PSM_OSD_ONLY = 0;
-     /** Automatic page segmentation with orientation and script detection (OSD). */
-     int PSM_AUTO_OSD = 1;
-     /** Automatic page segmentation, but no OSD, or OCR. */
-     int PSM_AUTO_ONLY = 2;
-     /** Fully automatic page segmentation, but no OSD. */
-     int PSM_AUTO = 3;
-     /** Assume a single column of text of variable sizes. */
-     int PSM_SINGLE_COLUMN = 4;
-     /** Assume a single uniform block of vertically aligned text. */
-     int PSM_SINGLE_BLOCK_VERT_TEXT = 5;
-     /** Assume a single uniform block of text. */
-     int PSM_SINGLE_BLOCK = 6;
-     /** Treat the image as a single text line. */
-     int PSM_SINGLE_LINE = 7;
-     /** Treat the image as a single word. */
-     int PSM_SINGLE_WORD = 8;
-     /** Treat the image as a single word in a circle. */
-     int PSM_CIRCLE_WORD = 9;
-     /** Treat the image as a single character. */
-     int PSM_SINGLE_CHAR = 10;
-     /** Find as much text as possible in no particular order. */
-     int PSM_SPARSE_TEXT = 11;
-     /** Sparse text with orientation and script detection. */
-     int PSM_SPARSE_TEXT_OSD = 12;
-     /** Number of enum entries. */
-     int PSM_COUNT = 13;
- }
-
- /**
-  * Levels of the page hierarchy. {@code ResultIterator} takes one of these
-  * values so that a single set of functions can operate on every level,
-  * instead of needing five times as many functions.
-  */
- public static interface TessPageIteratorLevel {
-
-     /** Block of text/image/separator line. */
-     int RIL_BLOCK = 0;
-     /** Paragraph within a block. */
-     int RIL_PARA = 1;
-     /** Line within a paragraph. */
-     int RIL_TEXTLINE = 2;
-     /** Word within a textline. */
-     int RIL_WORD = 3;
-     /** Symbol/character within a word. */
-     int RIL_SYMBOL = 4;
- }
-
- /**
-  * Possible types for a POLY_BLOCK or ColPartition. Must be kept in sync
-  * with {@code kPBColors} in polyblk.cpp and the {@code PTIs*Type}
-  * functions, as well as {@code kPolyBlockNames} in publictypes.cpp. Used
-  * extensively by ColPartition and POLY_BLOCK.
-  */
- public static interface TessPolyBlockType {
-
-     /** Type is not yet known. Keep as the first element. */
-     int PT_UNKNOWN = 0;
-     /** Text that lives inside a column. */
-     int PT_FLOWING_TEXT = 1;
-     /** Text that spans more than one column. */
-     int PT_HEADING_TEXT = 2;
-     /** Text that is in a cross-column pull-out region. */
-     int PT_PULLOUT_TEXT = 3;
-     /** Partition belonging to an equation region. */
-     int PT_EQUATION = 4;
-     /** Partition has inline equation. */
-     int PT_INLINE_EQUATION = 5;
-     /** Partition belonging to a table region. */
-     int PT_TABLE = 6;
-     /** Text-line runs vertically. */
-     int PT_VERTICAL_TEXT = 7;
-     /** Text that belongs to an image. */
-     int PT_CAPTION_TEXT = 8;
-     /** Image that lives inside a column. */
-     int PT_FLOWING_IMAGE = 9;
-     /** Image that spans more than one column. */
-     int PT_HEADING_IMAGE = 10;
-     /** Image that is in a cross-column pull-out region. */
-     int PT_PULLOUT_IMAGE = 11;
-     /** Horizontal Line. */
-     int PT_HORZ_LINE = 12;
-     /** Vertical Line. */
-     int PT_VERT_LINE = 13;
-     /** Lies outside of any column. */
-     int PT_NOISE = 14;
-     /** Number of enum entries. */
-     int PT_COUNT = 15;
- }
-
- /**
- * NOTA BENE: Fully justified paragraphs (text aligned to both left and
- * right margins) are marked by Tesseract with JUSTIFICATION_LEFT if their
- * text is written with a left-to-right script and with JUSTIFICATION_RIGHT
- * if their text is written in a right-to-left script.- * +------------------+ - * | 1 Aaaa Aaaa Aaaa | - * | Aaa aa aaa aa | - * | aaaaaa A aa aaa. | - * | 2 | - * | ####### c c C | - * | ####### c c c | - * | < ####### c c c | - * | < ####### c c | - * | < ####### . c | - * | 3 ####### c | - * +------------------+ - *Orientation Example: - *
WRITING_DIRECTION_LEFT_TO_RIGHT
implies
- * TEXTLINE_ORDER_TOP_TO_BOTTOM
.
- */
- public static interface TessTextlineOrder {
-
-     /** Left-to-right textline order. */
-     int TEXTLINE_ORDER_LEFT_TO_RIGHT = 0;
-     /** Right-to-left textline order. */
-     int TEXTLINE_ORDER_RIGHT_TO_LEFT = 1;
-     /** Top-to-bottom textline order. */
-     int TEXTLINE_ORDER_TOP_TO_BOTTOM = 2;
- }
-
- // Integer stand-ins for boolean values; presumably meant for the int-typed
- // flag parameters of the native functions (TODO confirm against callers).
- public static final int TRUE = 1;
- public static final int FALSE = 0;
-
- /**
-  * Base class for all tesseract APIs. Specific classes can add the ability
-  * to work on different inputs or produce different outputs. This class is
-  * mostly an interface layer on top of the Tesseract instance class that
-  * hides the data types, so that users of this class do not have to include
-  * any other Tesseract headers.
-  */
- public static class TessBaseAPI extends PointerType {
-
-     /** Creates a handle through the no-argument {@code PointerType} constructor. */
-     public TessBaseAPI() {
-     }
-
-     /**
-      * Creates a handle around an existing pointer.
-      *
-      * @param address pointer passed on to the {@code PointerType} constructor
-      */
-     public TessBaseAPI(Pointer address) {
-         super(address);
-     }
- }
-
- /**
- * Class to iterate over tesseract page structure, providing access to all
- * levels of the page hierarchy, without including any tesseract headers or
- * having to handle any tesseract structures.Init
,
- * SetImage
, Recognize
, Clear
,
- * End
DetectOS
, or anything else that changes the
- * internal PAGE_RES
. See apitypes.h
for the
- * definition of PageIteratorLevel
. See also
- * ResultIterator
, derived from PageIterator
,
- * which adds in the ability to access OCR output with text-specific
- * methods.
- */
- public static class TessPageIterator extends PointerType {
-
-     /** Creates a handle through the no-argument {@code PointerType} constructor. */
-     public TessPageIterator() {
-     }
-
-     /**
-      * Creates a handle around an existing pointer.
-      *
-      * @param address pointer passed on to the {@code PointerType} constructor
-      */
-     public TessPageIterator(Pointer address) {
-         super(address);
-     }
- }
-
- /**
-  * MutableIterator adds access to internal data structures.
-  */
- public static class TessMutableIterator extends PointerType {
-
-     /** Creates a handle through the no-argument {@code PointerType} constructor. */
-     public TessMutableIterator() {
-     }
-
-     /**
-      * Creates a handle around an existing pointer.
-      *
-      * @param address pointer passed on to the {@code PointerType} constructor
-      */
-     public TessMutableIterator(Pointer address) {
-         super(address);
-     }
- }
-
- /**
-  * Iterator for tesseract results that is capable of iterating in proper
-  * reading order over bidirectional (e.g. mixed Hebrew and English) text.
-  * ResultIterator adds text-specific methods for access to OCR output.
-  */
- public static class TessResultIterator extends PointerType {
-
-     /** Creates a handle through the no-argument {@code PointerType} constructor. */
-     public TessResultIterator() {
-     }
-
-     /**
-      * Creates a handle around an existing pointer.
-      *
-      * @param address pointer passed on to the {@code PointerType} constructor
-      */
-     public TessResultIterator(Pointer address) {
-         super(address);
-     }
- }
-
- /**
-  * Typed pointer wrapper; judging by its name it refers to a native choice
-  * iterator (not documented in the original source).
-  */
- public static class TessChoiceIterator extends PointerType {
-
-     /** Creates a handle through the no-argument {@code PointerType} constructor. */
-     public TessChoiceIterator() {
-     }
-
-     /**
-      * Creates a handle around an existing pointer.
-      *
-      * @param address pointer passed on to the {@code PointerType} constructor
-      */
-     public TessChoiceIterator(Pointer address) {
-         super(address);
-     }
- }
-
- /**
-  * Interface for rendering tesseract results into a document, such as text,
-  * HOCR or pdf. This class is abstract; specific classes handle individual
-  * formats. The interface is used to inject the renderer class into
-  * tesseract when processing images.
-  *
-  * For simplicity of implementing this with tesseract version 3.01, the
-  * renderer contains document state that is cleared from document to
-  * document, just as the TessBaseAPI is. This way the base API can simply
-  * delegate its rendering functionality to injected renderers, and the
-  * renderers can manage the state needed for their specific formats in
-  * addition to the heuristics for producing it.
-  */
- public static class TessResultRenderer extends PointerType {
-
-     /** Creates a handle through the no-argument {@code PointerType} constructor. */
-     public TessResultRenderer() {
-     }
-
-     /**
-      * Creates a handle around an existing pointer.
-      *
-      * @param address pointer passed on to the {@code PointerType} constructor
-      */
-     public TessResultRenderer(Pointer address) {
-         super(address);
-     }
- }
-
- /**
-  * Description of the output of the OCR engine. This structure is used as
-  * both a progress monitor and the final output header, since it needs to be
-  * a valid progress monitor while the OCR engine is storing its output to
-  * shared memory. During progress, all the buffer info is -1. Progress
-  * starts at 0 and increases to 100 during OCR. No other constraint. Every
-  * progress callback, the OCR engine must set {@code ocr_alive} to 1. The
-  * HP side will set {@code ocr_alive} to 0. Repeated failure to reset to 1
-  * indicates that the OCR engine is dead. If the cancel function is not
-  * null then it is called with the number of user words found. If it
-  * returns true then operation is cancelled.
-  */
- public static class ETEXT_DESC extends Structure {
-
-     /**
-      * chars in this buffer(0). Total number of UTF-8 bytes for this run.
-      */
-     public short count;
-     /**
-      * percent complete increasing (0-100)
-      */
-     public short progress;
-     /**
-      * true if not last
-      */
-     public byte more_to_come;
-     /**
-      * ocr sets to 1, HP 0
-      */
-     public byte ocr_alive;
-     /**
-      * for errcode use
-      */
-     public byte err_code;
-     /**
-      * returns true to cancel
-      */
-     public CANCEL_FUNC cancel;
-     /**
-      * this or other data for cancel
-      */
-     public Pointer cancel_this;
-     /**
-      * time to stop if not 0
-      */
-     public TimeVal end_time;
-     /**
-      * character data
-      */
-     public EANYCODE_CHAR[] text = new EANYCODE_CHAR[1];
-
-     /**
-      * Gets the field order. The names are listed in the same order in
-      * which the fields are declared above.
-      *
-      * @return the names of this structure's fields, first declared first
-      */
-     @Override
-     protected List<String> getFieldOrder() {
-         // Parameterized return type instead of a raw List: avoids unchecked
-         // warnings and is a valid override whether the superclass declares
-         // List or List<String>.
-         return Arrays.asList("count", "progress", "more_to_come", "ocr_alive", "err_code", "cancel", "cancel_this", "end_time", "text");
-     }
- }
-
- /**
- * It should be noted that the format for char_code for version 2.0 and
- * beyond is UTF-8, which means that ASCII characters will come out as one
- * structure but other characters will be returned in two or more instances
- * of this structure with a single byte of the UTF-8 code in each, but each
- * will have the same bounding box.cancel_func
.
- */
- interface CANCEL_FUNC extends Callback {
-
- /**
- * Cancellation hook. According to the ETEXT_DESC documentation it is
- * called with the number of user words found, and returning true
- * cancels the operation.
- *
- * @param cancel_this "this or other data for cancel" (see the
- * ETEXT_DESC field of the same name)
- * @param words the number of user words found
- * @return true to cancel
- */
- boolean invoke(Pointer cancel_this, int words);
- };
-
- public static class TimeVal extends Structure {
-
- /**
- * seconds
- */
- public NativeLong tv_sec;
- /**
- * microseconds
- */
- public NativeLong tv_usec;
-
- @Override
- protected Listnull
indicates
- * the whole image.
- * @return the recognized text
- * @throws TesseractException
- */
- String doOCR(File imageFile, Rectangle rect) throws TesseractException;
-
- /**
- * Performs OCR operation.
- *
- * @param bi a buffered image
- * @return the recognized text
- * @throws TesseractException
- */
- String doOCR(BufferedImage bi) throws TesseractException;
-
- /**
- * Performs OCR operation.
- *
- * @param bi a buffered image
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or {@code null} indicates
- * the whole image.
- * @return the recognized text
- * @throws TesseractException
- */
- String doOCR(BufferedImage bi, Rectangle rect) throws TesseractException;
-
- /**
- * Performs OCR operation.
- *
- * @param imageList a list of IIOImage
objects
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or null
indicates
- * the whole image.
- * @return the recognized text
- * @throws TesseractException
- */
- String doOCR(ListIIOImage
objects
- * @param filename input file name. Needed only for training and reading a
- * UNLV zone file.
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or null
indicates
- * the whole image.
- * @return the recognized text
- * @throws TesseractException
- */
- String doOCR(ListSetImage
, (optionally)
- * SetRectangle
, and one or more of the Get*Text
- * functions.
- *
- * @param xsize width of image
- * @param ysize height of image
- * @param buf pixel data
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or null
indicates
- * the whole image.
- * @param bpp bits per pixel, represents the bit depth of the image, with 1
- * for binary bitmap, 8 for gray, and 24 for color RGB.
- * @return the recognized text
- * @throws TesseractException
- */
- String doOCR(int xsize, int ysize, ByteBuffer buf, Rectangle rect, int bpp) throws TesseractException;
-
- /**
- * Performs OCR operation. Use {@code SetImage}, (optionally)
- * {@code SetRectangle}, and one or more of the {@code Get*Text}
- * functions.
- *
- * @param xsize width of image
- * @param ysize height of image
- * @param buf pixel data
- * @param filename input file name. Needed only for training and reading a
- * UNLV zone file.
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or {@code null} indicates
- * the whole image.
- * @param bpp bits per pixel, represents the bit depth of the image, with 1
- * for binary bitmap, 8 for gray, and 24 for color RGB.
- * @return the recognized text
- * @throws TesseractException
- */
- String doOCR(int xsize, int ysize, ByteBuffer buf, String filename, Rectangle rect, int bpp) throws TesseractException;
-
- /**
- * Sets tessdata path.
- *
- * @param datapath the tessdata path to set
- */
- void setDatapath(String datapath);
-
- /**
- * Sets language for OCR.
- *
- * @param language the language code, which follows ISO 639-3 standard.
- */
- void setLanguage(String language);
-
- /**
- * Sets OCR engine mode.
- *
- * @param ocrEngineMode the OcrEngineMode to set
- */
- void setOcrEngineMode(int ocrEngineMode);
-
- /**
- * Sets page segmentation mode.
- *
- * @param mode the page segmentation mode to set
- */
- void setPageSegMode(int mode);
-
- /**
- * Sets the value of Tesseract's internal parameter.
- *
- * @param key variable name, e.g., {@code tessedit_create_hocr},
- * {@code tessedit_char_whitelist}, etc.
- * @param value value for corresponding variable, e.g., "1", "0",
- * "0123456789", etc.
- */
- void setTessVariable(String key, String value);
-
- /**
- * Sets configs to be passed to Tesseract's Init
method.
- *
- * @param configs list of config filenames, e.g., "digits", "bazaar",
- * "quiet"
- */
- void setConfigs(ListRectangle
- * @throws TesseractException
- */
- ListWord
- */
- ListTesseract OCR 3.04 API
using
- * JNA Interface Mapping
.
- */
-public interface TessAPI extends Library, ITessAPI {
-
- /**
- * An instance of the class library.
- */
- public static final TessAPI INSTANCE = LoadLibs.getTessAPIInstance();
-
- /**
- * Gets the version identifier.
- *
- * @return the version identifier
- */
- String TessVersion();
-
- /**
- * Deallocates the memory block occupied by text.
- *
- * @param text the pointer to text
- */
- void TessDeleteText(Pointer text);
-
- /**
- * Deallocates the memory block occupied by text array.
- *
- * @param arr text array pointer reference
- */
- void TessDeleteTextArray(PointerByReference arr);
-
- /**
- * Deallocates the memory block occupied by integer array.
- *
- * @param arr int array
- */
- void TessDeleteIntArray(IntBuffer arr);
-
- /* Renderer API */
- TessResultRenderer TessTextRendererCreate(String outputbase);
-
- TessResultRenderer TessHOcrRendererCreate(String outputbase);
-
- TessResultRenderer TessHOcrRendererCreate2(String outputbase, int font_info);
-
- TessResultRenderer TessPDFRendererCreate(String outputbase, String datadir);
-
- TessResultRenderer TessPDFRendererCreateTextonly(String outputbase, String datadir, int textonly);
-
- TessResultRenderer TessUnlvRendererCreate(String outputbase);
-
- TessResultRenderer TessBoxTextRendererCreate(String outputbase);
-
- void TessDeleteResultRenderer(TessResultRenderer renderer);
-
- void TessResultRendererInsert(TessResultRenderer renderer, TessResultRenderer next);
-
- TessResultRenderer TessResultRendererNext(TessResultRenderer renderer);
-
- int TessResultRendererBeginDocument(TessResultRenderer renderer, String title);
-
- int TessResultRendererAddImage(TessResultRenderer renderer, PointerByReference api);
-
- int TessResultRendererEndDocument(TessResultRenderer renderer);
-
- Pointer TessResultRendererExtention(TessResultRenderer renderer);
-
- Pointer TessResultRendererTitle(TessResultRenderer renderer);
-
- int TessResultRendererImageNum(TessResultRenderer renderer);
-
- /**
- * Creates an instance of the base class for all Tesseract APIs.
- *
- * @return the TesseractAPI instance
- */
- TessBaseAPI TessBaseAPICreate();
-
- /**
- * Disposes the TesseractAPI instance.
- *
- * @param handle the TesseractAPI instance
- */
- void TessBaseAPIDelete(TessBaseAPI handle);
-
- /**
- * Set the name of the input file. Needed only for training and reading a
- * UNLV zone file, and for searchable PDF output.
- *
- * @param handle the TesseractAPI instance
- * @param name name of the input file
- */
- void TessBaseAPISetInputName(TessBaseAPI handle, String name);
-
- /**
- * These functions are required for searchable PDF output. We need our hands
- * on the input file so that we can include it in the PDF without
- * transcoding. If that is not possible, we need the original image.
- * Finally, resolution metadata is stored in the PDF so we need that as
- * well.
- *
- * @param handle the TesseractAPI instance
- * @return input file name
- */
- String TessBaseAPIGetInputName(TessBaseAPI handle);
-
- void TessBaseAPISetInputImage(TessBaseAPI handle, Pix pix);
-
- Pix TessBaseAPIGetInputImage(TessBaseAPI handle);
-
- int TessBaseAPIGetSourceYResolution(TessBaseAPI handle);
-
- String TessBaseAPIGetDatapath(TessBaseAPI handle);
-
- /**
- * Set the name of the bonus output files. Needed only for debugging.
- *
- * @param handle the TesseractAPI instance
- * @param name name of the output file
- */
- void TessBaseAPISetOutputName(TessBaseAPI handle, String name);
-
- /**
- * Set the value of an internal "parameter." Supply the name of the
- * parameter and the value as a string, just as you would in a config file.
- * Returns false if the name lookup failed. E.g.,
- * SetVariable("tessedit_char_blacklist", "xyz");
to ignore x,
- * y and z. Or SetVariable("classify_bln_numeric_mode", "1");
- * to set numeric-only mode. SetVariable
may be used before
- * Init
, but settings will revert to defaults on
- * End()
.Init()
. Only works for non-init
- * variables (init variables should be passed to Init()
).
- *
- *
- * @param handle the TesseractAPI instance
- * @param name name of the input
- * @param value variable value
- * @return 1 on success
- */
- int TessBaseAPISetVariable(TessBaseAPI handle, String name, String value);
-
- /**
- * Get the value of an internal int parameter.
- *
- * @param handle the TesseractAPI instance
- * @param name name of the input
- * @param value pass the int buffer value
- * @return 1 on success
- */
- int TessBaseAPIGetIntVariable(TessBaseAPI handle, String name, IntBuffer value);
-
- /**
- * Get the value of an internal bool parameter.
- *
- * @param handle the TesseractAPI instance
- * @param name pass the name of the variable
- * @param value pass the int buffer value
- * @return 1 on success
- */
- int TessBaseAPIGetBoolVariable(TessBaseAPI handle, String name, IntBuffer value);
-
- /**
- * Get the value of an internal double parameter.
- *
- * @param handle the TesseractAPI instance
- * @param name pass the name of the variable
- * @param value pass the double buffer value
- * @return 1 on success
- */
- int TessBaseAPIGetDoubleVariable(TessBaseAPI handle, String name, DoubleBuffer value);
-
- /**
- * Get the value of an internal string parameter.
- *
- * @param handle the TesseractAPI instance
- * @param name pass the name of the variable
- * @return the string value
- */
- String TessBaseAPIGetStringVariable(TessBaseAPI handle, String name);
-
- /**
- * Print Tesseract parameters to the given file.SetVariable
on some of the Params in classify and textord.
- * If you do, then the effect will be to change it for all your
- * instances.Init
are those listed
- * above here in the class definition.Init
multiple times on the same instance to change language,
- * or just to reset the classifier. Languages may specify internally that
- * they want to be loaded with one or more other languages, so the ~
- * sign is available to override that. E.g., if hin
were set to
- * load eng
by default, then hin+~eng
would force
- * loading only hin
. The number of loaded languages is limited
- * only by memory, with the caveat that loading additional languages will
- * impact both speed and accuracy, as there is more work to do to decide on
- * the applicable language, and there is more chance of hallucinating
- * incorrect words. WARNING: On changing languages, all Tesseract parameters
- * are reset back to their default values. (Which may vary between
- * languages.) If you have a rare need to set a Variable that controls
- * initialization for a second call to Init
you should
- * explicitly call End()
and then use SetVariable
- * before Init
.Init
.set_only_non_debug_params
is true, only params that do
- * not contain "debug" in the name will be set.
- *
- * @param handle the TesseractAPI instance
- * @param datapath The datapath
must be the name of the parent
- * directory of tessdata
and must end in
- * /. Any name after the last / will be stripped.
- * @param language The language is (usually) an ISO 639-3
- * string or NULL
will default to eng
. The
- * language may be a string of the form [~]<lang>[+[~]<lang>]
- * indicating that multiple languages are to be loaded. E.g.,
- * hin+eng
will load Hindi and English.
- * @param oem ocr engine mode
- * @param configs pointer configuration
- * @param configs_size pointer configuration size
- * @return 0 on success and -1 on initialization failure
- */
- int TessBaseAPIInit1(TessBaseAPI handle, String datapath, String language, int oem,
- PointerByReference configs, int configs_size);
-
- /**
- * @param handle the TesseractAPI instance
- * @param datapath The datapath
must be the name of the parent
- * directory of tessdata
and must end in
- * /. Any name after the last / will be stripped.
- * @param language The language is (usually) an ISO 639-3
- * string or NULL
will default to eng
. The
- * language may be a string of the form [~]<lang>[+[~]<lang>]
- * indicating that multiple languages are to be loaded. E.g.,
- * hin+eng
will load Hindi and English.
- * @param oem ocr engine mode
- * @return 0 on success and -1 on initialization failure
- */
- int TessBaseAPIInit2(TessBaseAPI handle, String datapath, String language, int oem);
-
- /**
- * Initializes the instance for the given data path and language.
- *
- * @param handle the TesseractAPI instance
- * @param datapath The {@code datapath} must be the name of the parent
- * directory of {@code tessdata} and must end in
- * /. Any name after the last / will be stripped.
- * @param language The language is (usually) an {@code ISO 639-3}
- * string or {@code NULL} will default to {@code eng}. The
- * language may be a string of the form {@code [~]<lang>[+[~]<lang>]}
- * indicating that multiple languages are to be loaded. E.g.,
- * {@code hin+eng} will load Hindi and English.
- * @return 0 on success and -1 on initialization failure
- */
- int TessBaseAPIInit3(TessBaseAPI handle, String datapath, String language);
-
- /**
- *
- * @param handle the TesseractAPI instance
- * @param datapath The datapath
must be the name of the parent
- * directory of tessdata
and must end in
- * /. Any name after the last / will be stripped.
- * @param language The language is (usually) an ISO 639-3
- * string or NULL
will default to eng
. The
- * language may be a string of the form [~]<lang>[+[~]<lang>]
- * indicating that multiple languages are to be loaded. E.g.,
- * hin+eng
will load Hindi and English.
- * @param oem ocr engine mode
- * @param configs pointer configuration
- * @param configs_size pointer configuration size
- * @param vars_vec
- * @param vars_values
- * @param vars_vec_size
- * @param set_only_non_debug_params
- * @return 0 on success and -1 on initialization failure
- */
- int TessBaseAPIInit4(TessBaseAPI handle, String datapath, String language, int oem, PointerByReference configs, int configs_size, PointerByReference vars_vec, PointerByReference vars_values, NativeSize vars_vec_size, int set_only_non_debug_params);
-
- /**
- * Returns the languages string used in the last valid initialization. If
- * the last initialization specified "deu+hin" then that will be returned.
- * If {@code hin} loaded {@code eng} automatically as well, then
- * that will not be included in this list. To find the languages actually
- * loaded, use {@code GetLoadedLanguagesAsVector}. The returned string
- * should NOT be deleted.
- *
- * @param handle the TesseractAPI instance
- * @return languages as string
- */
- String TessBaseAPIGetInitLanguagesAsString(TessBaseAPI handle);
-
- /**
- * Returns the loaded languages in the vector of STRINGs. Includes all
- * languages loaded by the last {@code Init}, including those loaded as
- * dependencies of other loaded languages.
- *
- * @param handle the TesseractAPI instance
- * @return loaded languages as vector
- */
- PointerByReference TessBaseAPIGetLoadedLanguagesAsVector(TessBaseAPI handle);
-
- /**
- * Returns the available languages in the vector of STRINGs.
- *
- * @param handle the TesseractAPI instance
- * @return available languages as vector
- */
- PointerByReference TessBaseAPIGetAvailableLanguagesAsVector(TessBaseAPI handle);
-
- /**
- * Init only the lang model component of Tesseract. The only functions that
- * work after this init are SetVariable
and
- * IsValidWord
. WARNING: temporary! This function will be
- * removed from here and placed in a separate API at some future time.
- *
- * @param handle the TesseractAPI instance
- * @param datapath The datapath
must be the name of the parent
- * directory of tessdata
and must end in
- * /. Any name after the last / will be stripped.
- * @param language The language is (usually) an ISO 639-3
- * string or NULL
will default to eng. The language may be a
- * string of the form [~]<lang>[+[~]<lang>] indicating that
- * multiple languages are to be loaded. E.g., hin+eng will load Hindi and
- * English.
- * @return api init language mode
- */
- int TessBaseAPIInitLangMod(TessBaseAPI handle, String datapath, String language);
-
- /**
- * Init only for page layout analysis. Use only for calls to
- * {@code SetImage} and {@code AnalysePage}. Calls that attempt
- * recognition will generate an error.
- *
- * @param handle the TesseractAPI instance
- */
- void TessBaseAPIInitForAnalysePage(TessBaseAPI handle);
-
- /**
- * Read a "config" file containing a set of param, value pairs. Searches the
- * standard places: {@code tessdata/configs},
- * {@code tessdata/tessconfigs} and also accepts a relative or absolute
- * path name. Note: only non-init params will be set (init params are set by
- * {@code Init()}).
- *
- * @param handle the TesseractAPI instance
- * @param filename relative or absolute path for the "config" file
- * containing a set of param and value pairs
- * @param init_only not described in the original documentation; presumably
- * a flag restricting the call to init params (TODO confirm)
- */
- void TessBaseAPIReadConfigFile(TessBaseAPI handle, String filename, int init_only);
-
- /**
- * Set the current page segmentation mode. Defaults to
- * {@code PSM_SINGLE_BLOCK}. The mode is stored as an IntParam so it
- * can also be modified by {@code ReadConfigFile} or
- * {@code SetVariable("tessedit_pageseg_mode", mode as string)}.
- *
- * @param handle the TesseractAPI instance
- * @param mode tesseract page segment mode
- */
- void TessBaseAPISetPageSegMode(TessBaseAPI handle, int mode);
-
- /**
- * Return the current page segmentation mode.
- *
- * @param handle the TesseractAPI instance
- * @return page segment mode value
- */
- int TessBaseAPIGetPageSegMode(TessBaseAPI handle);
-
- /**
- * Recognize a rectangle from an image and return the result as a string.
- * May be called many times for a single Init
. Currently has no
- * error checking. Greyscale of 8 and color of 24 or 32 bits per pixel may
- * be given. Palette color images will not work properly and must be
- * converted to 24 bit. Binary images of 1 bit per pixel may also be given
- * but they must be byte packed with the MSB of the first byte being the
- * first pixel, and a 1 represents WHITE. For binary images set
- * bytes_per_pixel=0. The recognized text is returned as a char* which is
- * coded as UTF8 and must be freed with the delete [] operator.
- *
- * Note that TesseractRect is the simplified convenience interface. For
- * advanced uses, use SetImage, (optionally) SetRectangle, Recognize, and
- * one or more of the Get*Text functions below.
- *
- * @param handle the TesseractAPI instance
- * @param imagedata image byte buffer
- * @param bytes_per_pixel bytes per pixel
- * @param bytes_per_line bytes per line
- * @param left image left
- * @param top image top
- * @param width image width
- * @param height image height
- * @return the pointer to recognized text
- */
- Pointer TessBaseAPIRect(TessBaseAPI handle, ByteBuffer imagedata, int bytes_per_pixel, int bytes_per_line,
- int left, int top, int width, int height);
-
- /**
- * Call between pages or documents etc to free up memory and forget adaptive
- * data.
- *
- * @param handle the TesseractAPI instance
- */
- void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI handle);
-
- /**
- * Provide an image for Tesseract to recognize. Format is as
- * TesseractRect
above. Does not copy the image buffer, or take
- * ownership. The source image may be destroyed after Recognize
is called,
- * either explicitly or implicitly via one of the Get*Text
- * functions. SetImage
clears all recognition results, and sets
- * the rectangle to the full image, so it may be followed immediately by a
- * GetUTF8Text
, and it will automatically perform recognition.
- *
- * @param handle the TesseractAPI instance
- * @param imagedata image byte buffer
- * @param width image width
- * @param height image height
- * @param bytes_per_pixel bytes per pixel
- * @param bytes_per_line bytes per line
- */
- void TessBaseAPISetImage(TessBaseAPI handle, ByteBuffer imagedata, int width, int height,
- int bytes_per_pixel, int bytes_per_line);
-
- /**
- * Provide an image for Tesseract to recognize. As with
- * SetImage above, Tesseract doesn't take a copy or ownership
- * or pixDestroy the image, so it must persist until after
- * Recognize. Pix vs raw, which to use? Use
- * Pix where possible. A future version of Tesseract may choose
- * to use Pix as its internal representation and discard
- * IMAGE altogether. Because of that, an implementation that
- * sources and targets Pix may end up with fewer copies than an
- * implementation that does not.
- *
- * @param handle the TesseractAPI instance
- * @param pix image
- */
- void TessBaseAPISetImage2(TessBaseAPI handle, Pix pix);
-
- /**
- * Set the resolution of the source image in pixels per inch so font size
- * information can be calculated in results. Call this after
- * SetImage()
.
- *
- * @param handle the TesseractAPI instance
- * @param ppi source resolution value
- */
- void TessBaseAPISetSourceResolution(TessBaseAPI handle, int ppi);
-
- /**
- * Restrict recognition to a sub-rectangle of the image. Call after
- * SetImage
. Each SetRectangle
clears the
- * recognition results so multiple rectangles can be recognized with the
- * same image.
- *
- * @param handle the TesseractAPI instance
- * @param left value
- * @param top value
- * @param width value
- * @param height value
- */
- void TessBaseAPISetRectangle(TessBaseAPI handle, int left, int top, int width, int height);
-
- /**
- * ONLY available after SetImage
if you have Leptonica
- * installed. Get a copy of the internal thresholded image from Tesseract.
- *
- * @param handle the TesseractAPI instance
- * @return internal thresholded image
- */
- Pix TessBaseAPIGetThresholdedImage(TessBaseAPI handle);
-
- /**
- * Get the result of page layout analysis as a Leptonica-style
- * Boxa
, Pixa
pair, in reading order. Can be
- * called before or after Recognize
.
- *
- * @param handle the TesseractAPI instance
- * @param pixa array of Pix
- * @return array of Box
- */
- Boxa TessBaseAPIGetRegions(TessBaseAPI handle, PointerByReference pixa);
-
- /**
- * Get the textlines as a Leptonica-style Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize. If blockids is not NULL, the
- * block-id of each line is also returned as an array of one element per
- * line. delete [] after use. If paraids is not NULL, the paragraph-id of
- * each line within its block is also returned as an array of one element
- * per line. delete [] after use.
- *
- * @param handle the TesseractAPI instance
- * @param raw_image
- * @param raw_padding
- * @param pixa array of Pix
- * @param blockids
- * @param paraids
- * @return array of Box
- */
- Boxa TessBaseAPIGetTextlines1(TessBaseAPI handle, int raw_image, int raw_padding, PointerByReference pixa, PointerByReference blockids, PointerByReference paraids);
-
- /**
- * Get textlines and strips of image regions as a Leptonica-style
- * Boxa
, Pixa
pair, in reading order. Enables
- * downstream handling of non-rectangular regions. Can be called before or
- * after Recognize
. If blockids
is not NULL, the block-id of
- * each line is also returned as an array of one element per line. delete []
- * after use.
- *
- * @param handle the TesseractAPI instance
- * @param pixa array of Pix
- * @param blockids
- * @return array of Box
- */
- Boxa TessBaseAPIGetStrips(TessBaseAPI handle, PointerByReference pixa, PointerByReference blockids);
-
- /**
- * Get the words as a Leptonica-style Boxa
, Pixa
- * pair, in reading order. Can be called before or after
- * Recognize
.
- *
- * @param handle the TesseractAPI instance
- * @param pixa array of Pix
- * @return array of Box
- */
- Boxa TessBaseAPIGetWords(TessBaseAPI handle, PointerByReference pixa);
-
- /**
- * Gets the individual connected (text) components (created after pages
- * segmentation step, but before recognition) as a Leptonica-style
- * Boxa
, Pixa
pair, in reading order. Can be
- * called before or after Recognize
.
- *
- * @param handle the TesseractAPI instance
- * @param cc array of Pix
- * @return array of Box
- */
- Boxa TessBaseAPIGetConnectedComponents(TessBaseAPI handle, PointerByReference cc);
-
- /**
- * Get the given level kind of components (block, textline, word etc.) as a
- * Leptonica-style Boxa
, Pixa
pair, in reading
- * order. Can be called before or after Recognize
. If blockids
- * is not NULL
, the block-id of each component is also returned
- * as an array of one element per component. delete [] after use. If
- * text_only
is true, then only text components are returned.
- * Helper function to get binary images with no padding (most common usage).
- *
- * @param handle the TesseractAPI instance
- * @param level PageIteratorLevel
- * @param text_only
- * @param pixa array of Pix
- * @param blockids
- * @return array of Box
- */
- Boxa TessBaseAPIGetComponentImages(TessBaseAPI handle, int level, int text_only, PointerByReference pixa, PointerByReference blockids);
-
- /**
- * Get the given level kind of components (block, textline, word etc.) as a
- * Leptonica-style Boxa
, Pixa
pair, in reading
- * order. Can be called before or after Recognize
. If blockids
- * is not NULL
, the block-id of each component is also returned
- * as an array of one element per component. delete [] after use. If
- * paraids
is not NULL
, the paragraph-id of each
- * component with its block is also returned as an array of one element per
- * component. delete [] after use. If raw_image
is true, then
- * portions of the original image are extracted instead of the thresholded
- * image and padded with raw_padding. If text_only
is true,
- * then only text components are returned.
- *
- * @param handle the TesseractAPI instance
- * @param level PageIteratorLevel
- * @param text_only
- * @param raw_image
- * @param raw_padding
- * @param pixa array of Pix
- * @param blockids
- * @param paraids
- * @return
- */
- Boxa TessBaseAPIGetComponentImages1(TessBaseAPI handle, int level, int text_only, int raw_image, int raw_padding, PointerByReference pixa, PointerByReference blockids, PointerByReference paraids);
-
- /**
- * @param handle the TesseractAPI instance
- * @return Scale factor from original image.
- */
- int TessBaseAPIGetThresholdedImageScaleFactor(TessBaseAPI handle);
-
- /**
- * Dump the internal binary image to a PGM file.
- *
- * @param handle the TesseractAPI instance
- * @param filename pgm file name
- */
- void TessBaseAPIDumpPGM(TessBaseAPI handle, String filename);
-
- /**
- * Runs page layout analysis in the mode set by SetPageSegMode.
- * May optionally be called prior to Recognize to get access to
- * just the page layout results. Returns an iterator to the results. Returns
- * NULL on error. The returned iterator must be deleted after
- * use. WARNING! This class points to data held within the
- * TessBaseAPI class, and therefore can only be used while the
- * TessBaseAPI class still exists and has not been subjected to
- * a call of Init, SetImage, Recognize, Clear, End, DetectOS,
- * or anything else that changes the internal PAGE_RES.
- *
- * @param handle the TesseractAPI instance
- * @return returns an iterator to the results. Returns NULL on error. The
- * returned iterator must be deleted after use.
- */
- TessPageIterator TessBaseAPIAnalyseLayout(TessBaseAPI handle);
-
- /**
- * Recognize the image from SetAndThresholdImage, generating
- * Tesseract internal structures. Returns 0 on success. Optional. The
- * Get*Text functions below will call Recognize if
- * needed. After Recognize, the output is kept internally until
- * the next SetImage.
- *
- * @param handle the TesseractAPI instance
- * @param monitor the result as Tesseract internal structures
- * @return 0 on success
- */
- int TessBaseAPIRecognize(TessBaseAPI handle, ETEXT_DESC monitor);
-
- /**
- * Variant on Recognize
used for testing chopper.
- *
- * @param handle the TesseractAPI instance
- * @param monitor the result as Tesseract internal structures
- * @return 0 on success
- */
- int TessBaseAPIRecognizeForChopTest(TessBaseAPI handle, ETEXT_DESC monitor);
-
- /**
- * Get a reading-order iterator to the results of LayoutAnalysis and/or
- * Recognize
. The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the
- * TessBaseAPI
class, and therefore can only be used while the
- * TessBaseAPI
class still exists and has not been subjected to
- * a call of Init
, SetImage
,
- * Recognize
, Clear
, End
, DetectOS,
- * or anything else that changes the internal PAGE_RES.
- *
- * @param handle the TesseractAPI instance
- * @return the result iterator
- */
- TessResultIterator TessBaseAPIGetIterator(TessBaseAPI handle);
-
- /**
- * Get a mutable iterator to the results of LayoutAnalysis and/or
- * Recognize
. The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the
- * TessBaseAPI
class, and therefore can only be used while the
- * TessBaseAPI
class still exists and has not been subjected to
- * a call of Init
, SetImage
,
- * Recognize
, Clear
, End
, DetectOS,
- * or anything else that changes the internal PAGE_RES
.
- *
- * @param handle the TesseractAPI instance
- * @return the mutable iterator
- */
- TessMutableIterator TessBaseAPIGetMutableIterator(TessBaseAPI handle);
-
- /**
- * Recognizes all the pages in the named file, as a multi-page tiff or list
- * of filenames, or single image, and gets the appropriate kind of text
- * according to parameters: tessedit_create_boxfile
,
- * tessedit_make_boxes_from_boxes
,
- * tessedit_write_unlv
, tessedit_create_hocr
.
- * Calls ProcessPage on each page in the input file, which may be a
- * multi-page tiff, single-page other file format, or a plain text list of
- * images to read. If tessedit_page_number is non-negative, processing
- * begins at that page of a multi-page tiff file, or filelist. The text is
- * returned in text_out. Returns false on error. If non-zero
- * timeout_millisec terminates processing after the timeout on a single
- * page. If non-NULL and non-empty, and some page fails for some reason, the
- * page is reprocessed with the retry_config config file. Useful for
- * interactively debugging a bad page.
- *
- * @param handle the TesseractAPI instance
- * @param filename multi-page tiff or list of filenames
- * @param retry_config retry config values
- * @param timeout_millisec timeout value
- * @param renderer result renderer
- * @return the status
- */
- int TessBaseAPIProcessPages(TessBaseAPI handle, String filename, String retry_config, int timeout_millisec, TessResultRenderer renderer);
-
- int TessBaseAPIProcessPage(TessBaseAPI handle, Pix pix, int page_index, String filename, String retry_config, int timeout_millisec, TessResultRenderer renderer);
-
- /**
- * The recognized text is returned as a char* which is coded as UTF-8 and
- * must be freed with the delete [] operator.
- *
- * @param handle the TesseractAPI instance
- * @return the pointer to output text
- */
- Pointer TessBaseAPIGetUTF8Text(TessBaseAPI handle);
-
- /**
- * Make a HTML-formatted string with hOCR markup from the internal data
- * structures. page_number is 0-based but will appear in the output as
- * 1-based.
- *
- * @param handle the TesseractAPI instance
- * @param page_number page number
- * @return the pointer to hOCR text
- */
- Pointer TessBaseAPIGetHOCRText(TessBaseAPI handle, int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded as a UTF8 box
- * file and must be freed with the delete [] operator. page_number is a
- * 0-base page index that will appear in the box file.
- *
- * @param handle the TesseractAPI instance
- * @param page_number number of the page
- * @return the pointer to box text
- */
- Pointer TessBaseAPIGetBoxText(TessBaseAPI handle, int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded as UNLV format
- * Latin-1 with specific reject and suspect codes and must be freed with the
- * delete [] operator.
- *
- * @param handle the TesseractAPI instance
- * @return the pointer to UNLV text
- */
- Pointer TessBaseAPIGetUNLVText(TessBaseAPI handle);
-
- /**
- * Returns the average word confidence for Tesseract page result.
- *
- * @param handle the TesseractAPI instance
- * @return the (average) confidence value between 0 and 100.
- */
- int TessBaseAPIMeanTextConf(TessBaseAPI handle);
-
- /**
- * Returns an array of all word confidences, terminated by -1. The calling
- * function must delete [] after use. The number of confidences should
- * correspond to the number of space-delimited words in
- * GetUTF8Text
.
- *
- * @param handle the TesseractAPI instance
- * @return all word confidences (between 0 and 100) in an array, terminated
- * by -1
- */
- IntByReference TessBaseAPIAllWordConfidences(TessBaseAPI handle);
-
- /**
- * Applies the given word to the adaptive classifier if possible. The word
- * must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can tell the
- * boundaries of the graphemes. Assumes that
- * SetImage
/SetRectangle
have been used to set the
- * image to the given word. The mode arg should be
- * PSM_SINGLE_WORD
or PSM_CIRCLE_WORD
, as that
- * will be used to control layout analysis. The currently set PageSegMode is
- * preserved.
- *
- * @param handle the TesseractAPI instance
- * @param mode tesseract page segment mode
- * @param wordstr The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s ,
- * so it can tell the boundaries of the graphemes.
- * @return false if adaption was not possible for some reason.
- */
- int TessBaseAPIAdaptToWordStr(TessBaseAPI handle, int mode, String wordstr);
-
- /**
- * Free up recognition results and any stored image data, without actually
- * freeing any recognition data that would be time-consuming to reload.
- * Afterwards, you must call SetImage
or
- * TesseractRect
before doing any Recognize
or
- * Get*
operation.
- *
- * @param handle the TesseractAPI instance
- */
- void TessBaseAPIClear(TessBaseAPI handle);
-
- /**
- * Close down tesseract and free up all memory. End()
is
- * equivalent to destructing and reconstructing your TessBaseAPI. Once
- * End()
has been used, none of the other API functions may be
- * used other than Init
and anything declared above it in the
- * class definition.
- *
- * @param handle the TesseractAPI instance
- */
- void TessBaseAPIEnd(TessBaseAPI handle);
-
- /**
- * Check whether a word is valid according to Tesseract's language model.
- *
- * @param handle the TesseractAPI instance
- * @param word word value
- * @return 0 if the word is invalid, non-zero if valid
- */
- int TessBaseAPIIsValidWord(TessBaseAPI handle, String word);
-
- /**
- * Gets text direction.
- *
- * @param handle the TesseractAPI instance
- * @param out_offset offset
- * @param out_slope slope
- * @return TRUE if text direction is valid
- */
- int TessBaseAPIGetTextDirection(TessBaseAPI handle, IntBuffer out_offset, FloatBuffer out_slope);
-
- /**
- * Clear any library-level memory caches. There are a variety of
- * expensive-to-load constant data structures (mostly language dictionaries)
- * that are cached globally -- surviving the Init()
and
- * End()
of individual TessBaseAPI's. This function allows the
- * clearing of these caches.
- *
- * @param handle the TesseractAPI instance
- */
- void TessBaseAPIClearPersistentCache(TessBaseAPI handle);
-
- /**
- * Detect the orientation of the input image and apparent script (alphabet).
- * orient_deg
is the detected clockwise rotation of the input image in
- * degrees (0, 90, 180, 270); orient_conf
is the confidence (15.0 is
- * reasonably confident); script_name
is an ASCII string, the name of the
- * script, e.g. "Latin"; script_conf
is confidence level in the script.
- *
- * @return TRUE on success and writes values to each parameter as an output
- */
- int TessBaseAPIDetectOrientationScript(TessBaseAPI handle, IntBuffer orient_deg, FloatBuffer orient_conf, PointerByReference script_name, FloatBuffer script_conf);
-
- /**
- * Gets the string of the specified unichar.
- *
- * @param handle the TesseractAPI instance
- * @param unichar_id the unichar id
- * @return the string form of the specified unichar.
- */
- String TessBaseAPIGetUnichar(TessBaseAPI handle, int unichar_id);
-
- /**
- * Deletes the specified PageIterator instance.
- *
- * @param handle the TessPageIterator instance
- */
- void TessPageIteratorDelete(TessPageIterator handle);
-
- /**
- * Creates a copy of the specified PageIterator instance.
- *
- * @param handle the TessPageIterator instance
- * @return page iterator copy
- */
- TessPageIterator TessPageIteratorCopy(TessPageIterator handle);
-
- /**
- * Resets the iterator to point to the start of the page.
- *
- * @param handle the TessPageIterator instance
- */
- void TessPageIteratorBegin(TessPageIterator handle);
-
- /**
- * Moves to the start of the next object at the given level in the page
- * hierarchy, and returns false if the end of the page was reached. NOTE
- * (CHANGED!) that ALL PageIteratorLevel level values will visit each
- * non-text block at least once. PTIsTextType(BLockType()) is false for
- * non-text blocks. Use pixDestroy to delete the image after use. The following
- * methods are used to generate the images: RIL_BLOCK
: mask the
- * page image with the block polygon. RIL_TEXTLINE
: Clip the
- * rectangle of the line box from the page image. TODO(rays) fix this to
- * generate and use a line polygon. RIL_WORD
: Clip the
- * rectangle of the word box from the page image. RIL_SYMBOL
:
- * Render the symbol outline to an image for cblobs (prior to recognition)
- * or the bounding box otherwise. A reconstruction of the original image
- * (using xor to check for double representation) should be reasonably
- * accurate, apart from removed noise, at the block level. Below the block
- * level, the reconstruction will be missing images and line separators. At
- * the symbol level, kerned characters will invade the bounding box if
- * rendered after recognition, making an xor reconstruction inaccurate, but
- * an or construction better. Before recognition, symbol-level
- * reconstruction should be good, even with xor, since the images come from
- * the connected components.
- *
- * @param handle the TessPageIterator instance
- * @param level PageIteratorLevel
- * @return
- */
- Pix TessPageIteratorGetBinaryImage(TessPageIterator handle, int level);
-
- /**
- * Returns an image of the current object at the given level in greyscale if
- * available in the input. To guarantee a binary image use BinaryImage. NOTE
- * that in order to give the best possible image, the bounds are expanded
- * slightly over the binary connected component, by the supplied padding, so
- * the top-left position of the returned image is returned in (left,top).
- * These will most likely not match the coordinates returned by BoundingBox.
- * If you do not supply an original image, you will get a binary one. Use
- * pixDestroy
to delete the image after use.
- *
- * @param handle the TessPageIterator instance
- * @param level PageIteratorLevel
- * @param padding
- * @param original_image
- * @param left
- * @param top
- * @return
- */
- Pix TessPageIteratorGetImage(TessPageIterator handle, int level, int padding, Pix original_image, IntBuffer left, IntBuffer top);
-
- /**
- * Returns the baseline of the current object at the given level. The
- * baseline is the line that passes through (x1, y1) and (x2, y2). The
- * returned font name is invalidated by Init, SetImage, End or deleting
- * the TessBaseAPI. Pointsize is returned in printers points (1/72 inch).
- *
- * @param handle the TessResultIterator instance
- * @param is_bold font attribute
- * @param is_italic font attribute
- * @param is_underlined font attribute
- * @param is_monospace font attribute
- * @param is_serif font attribute
- * @param is_smallcaps font attribute
- * @param pointsize font attribute
- * @param font_id font attribute
- * @return font name
- */
- String TessResultIteratorWordFontAttributes(TessResultIterator handle, IntBuffer is_bold,
- IntBuffer is_italic, IntBuffer is_underlined, IntBuffer is_monospace, IntBuffer is_serif,
- IntBuffer is_smallcaps, IntBuffer pointsize, IntBuffer font_id);
-
- /**
- * Returns TRUE if the current word was found in a dictionary.
- *
- * @param handle the TessResultIterator instance
- * @return 1 if word is from dictionary
- */
- int TessResultIteratorWordIsFromDictionary(TessResultIterator handle);
-
- /**
- * Returns TRUE if the current word is numeric.
- *
- * @param handle the TessResultIterator instance
- * @return 1 if word is numeric
- */
- int TessResultIteratorWordIsNumeric(TessResultIterator handle);
-
- /**
- * Returns TRUE if the current symbol is a superscript. If iterating at a
- * higher level object than symbols, e.g., words, then this will return the
- * attributes of the first symbol in that word.
- *
- * @param handle the TessResultIterator instance
- * @return 1 if symbol is superscript
- */
- int TessResultIteratorSymbolIsSuperscript(TessResultIterator handle);
-
- /**
- * Returns TRUE if the current symbol is a subscript. If iterating at a
- * higher level object than symbols, e.g., words, then this will return the
- * attributes of the first symbol in that word.
- *
- * @param handle the TessResultIterator instance
- * @return 1 if symbol is subscript
- */
- int TessResultIteratorSymbolIsSubscript(TessResultIterator handle);
-
- /**
- * Returns TRUE if the current symbol is a dropcap. If iterating at a higher
- * level object than symbols, e.g., words, then this will return the
- * attributes of the first symbol in that word.
- *
- * @param handle the TessResultIterator instance
- * @return 1 if symbol is dropcap
- */
- int TessResultIteratorSymbolIsDropcap(TessResultIterator handle);
-
- /* Choice iterator */
- TessChoiceIterator TessResultIteratorGetChoiceIterator(TessResultIterator handle);
-
- void TessChoiceIteratorDelete(TessChoiceIterator handle);
-
- int TessChoiceIteratorNext(TessChoiceIterator handle);
-
- String TessChoiceIteratorGetUTF8Text(TessChoiceIterator handle);
-
- float TessChoiceIteratorConfidence(TessChoiceIterator handle);
-}
diff --git a/Tess4J/src/net/sourceforge/tess4j/TessAPI1.java b/Tess4J/src/net/sourceforge/tess4j/TessAPI1.java
deleted file mode 100644
index c113b86..0000000
--- a/Tess4J/src/net/sourceforge/tess4j/TessAPI1.java
+++ /dev/null
@@ -1,1228 +0,0 @@
-/**
- * Copyright @ 2012 Quan Nguyen
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package net.sourceforge.tess4j;
-
-import java.nio.ByteBuffer;
-import java.nio.DoubleBuffer;
-import java.nio.FloatBuffer;
-import java.nio.IntBuffer;
-
-import com.sun.jna.Library;
-import com.sun.jna.Native;
-import com.sun.jna.Pointer;
-import com.sun.jna.ptr.IntByReference;
-import com.sun.jna.ptr.PointerByReference;
-
-import com.ochafik.lang.jnaerator.runtime.NativeSize;
-import net.sourceforge.lept4j.Boxa;
-import net.sourceforge.lept4j.Pix;
-import net.sourceforge.tess4j.util.LoadLibs;
-
-/**
- * A Java wrapper for Tesseract OCR 3.04 API using
- * JNA Direct Mapping.
- */
-public class TessAPI1 implements Library, ITessAPI {
-
- static {
- Native.register(LoadLibs.getTesseractLibName());
- }
-
- /**
- * Gets the version identifier.
- *
- * @return the version identifier
- */
- public static native String TessVersion();
-
- /**
- * Deallocates the memory block occupied by text.
- *
- * @param text the pointer to text
- */
- public static native void TessDeleteText(Pointer text);
-
- /**
- * Deallocates the memory block occupied by text array.
- *
- * @param arr text array pointer reference
- */
- public static native void TessDeleteTextArray(PointerByReference arr);
-
- /**
- * Deallocates the memory block occupied by integer array.
- *
- * @param arr int array
- */
- public static native void TessDeleteIntArray(IntBuffer arr);
-
- /* Renderer API */
- public static native TessResultRenderer TessTextRendererCreate(String outputbase);
-
- public static native TessResultRenderer TessHOcrRendererCreate(String outputbase);
-
- public static native TessResultRenderer TessHOcrRendererCreate2(String outputbase, int font_info);
-
- public static native TessResultRenderer TessPDFRendererCreate(String outputbase, String datadir);
-
- public static native TessResultRenderer TessPDFRendererCreateTextonly(String outputbase, String datadir, int textonly);
-
- public static native TessResultRenderer TessUnlvRendererCreate(String outputbase);
-
- public static native TessResultRenderer TessBoxTextRendererCreate(String outputbase);
-
- public static native void TessDeleteResultRenderer(TessResultRenderer renderer);
-
- public static native void TessResultRendererInsert(TessResultRenderer renderer, TessResultRenderer next);
-
- public static native TessResultRenderer TessResultRendererNext(TessResultRenderer renderer);
-
- public static native int TessResultRendererBeginDocument(TessResultRenderer renderer, String title);
-
- public static native int TessResultRendererAddImage(TessResultRenderer renderer, PointerByReference api);
-
- public static native int TessResultRendererEndDocument(TessResultRenderer renderer);
-
- public static native Pointer TessResultRendererExtention(TessResultRenderer renderer);
-
- public static native Pointer TessResultRendererTitle(TessResultRenderer renderer);
-
- public static native int TessResultRendererImageNum(TessResultRenderer renderer);
-
- /**
- * Creates an instance of the base class for all Tesseract APIs.
- *
- * @return the TesseractAPI instance
- */
- public static native TessBaseAPI TessBaseAPICreate();
-
- /**
- * Disposes the TesseractAPI instance.
- *
- * @param handle the TesseractAPI instance
- */
- public static native void TessBaseAPIDelete(TessBaseAPI handle);
-
- /**
- * Set the name of the input file. Needed only for training and reading a
- * UNLV zone file, and for searchable PDF output.
- *
- * @param handle the TesseractAPI instance
- * @param name name of the input file
- */
- public static native void TessBaseAPISetInputName(TessBaseAPI handle, String name);
-
- /**
- * These functions are required for searchable PDF output. We need our hands
- * on the input file so that we can include it in the PDF without
- * transcoding. If that is not possible, we need the original image.
- * Finally, resolution metadata is stored in the PDF so we need that as
- * well.
- *
- * @param handle the TesseractAPI instance
- * @return input file name
- */
- public static native String TessBaseAPIGetInputName(TessBaseAPI handle);
-
- public static native void TessBaseAPISetInputImage(TessBaseAPI handle, Pix pix);
-
- public static native Pix TessBaseAPIGetInputImage(TessBaseAPI handle);
-
- public static native int TessBaseAPIGetSourceYResolution(TessBaseAPI handle);
-
- public static native String TessBaseAPIGetDatapath(TessBaseAPI handle);
-
- /**
- * Set the name of the bonus output files. Needed only for debugging.
- *
- * @param handle the TesseractAPI instance
- * @param name name of the output file
- */
- public static native void TessBaseAPISetOutputName(TessBaseAPI handle, String name);
-
- /**
- * Set the value of an internal "parameter." Supply the name of the
- * parameter and the value as a string, just as you would in a config file.
- * Returns false if the name lookup failed. E.g.,
- * SetVariable("tessedit_char_blacklist", "xyz"); to ignore x,
- * y and z. Or SetVariable("classify_bln_numeric_mode", "1");
- * to set numeric-only mode. SetVariable may be used before
- * Init, but settings will revert to defaults on End().
- *
- * Note: Must be called after Init(). Only works for non-init
- * variables (init variables should be passed to Init()).
- *
- *
- * @param handle the TesseractAPI instance
- * @param name name of the input
- * @param value variable value
- * @return 1 on success
- */
- public static native int TessBaseAPISetVariable(TessBaseAPI handle, String name, String value);
-
- /**
- * Get the value of an internal int parameter.
- *
- * @param handle the TesseractAPI instance
- * @param name name of the input
- * @param value pass the int buffer value
- * @return 1 on success
- */
- public static native int TessBaseAPIGetIntVariable(TessBaseAPI handle, String name, IntBuffer value);
-
- /**
- * Get the value of an internal bool parameter.
- *
- * @param handle the TesseractAPI instance
- * @param name pass the name of the variable
- * @param value pass the int buffer value
- * @return 1 on success
- */
- public static native int TessBaseAPIGetBoolVariable(TessBaseAPI handle, String name, IntBuffer value);
-
- /**
- * Get the value of an internal double parameter.
- *
- * @param handle the TesseractAPI instance
- * @param name pass the name of the variable
- * @param value pass the double buffer value
- * @return 1 on success
- */
- public static native int TessBaseAPIGetDoubleVariable(TessBaseAPI handle, String name, DoubleBuffer value);
-
- /**
- * Get the value of an internal string parameter.
- *
- * @param handle the TesseractAPI instance
- * @param name pass the name of the variable
- * @return the string value
- */
- public static native String TessBaseAPIGetStringVariable(TessBaseAPI handle, String name);
-
- /**
- * Print Tesseract parameters to the given file.SetVariable
on some of the Params in classify and textord.
- * If you do, then the effect will be to change it for all your
- * instances.
- *
- * Start tesseract. Returns zero on success and -1 on failure. NOTE that the
- * only members that may be called before {@code Init} are those listed
- * above here in the class definition.
- *
- * It is entirely safe (and eventually will be efficient) to call
- * {@code Init} multiple times on the same instance to change language,
- * or just to reset the classifier. Languages may specify internally that
- * they want to be loaded with one or more other languages, so the ~
- * sign is available to override that. E.g., if {@code hin} were set to
- * load {@code eng} by default, then {@code hin+~eng} would force
- * loading only {@code hin}. The number of loaded languages is limited
- * only by memory, with the caveat that loading additional languages will
- * impact both speed and accuracy, as there is more work to do to decide on
- * the applicable language, and there is more chance of hallucinating
- * incorrect words. WARNING: On changing languages, all Tesseract parameters
- * are reset back to their default values. (Which may vary between
- * languages.) If you have a rare need to set a Variable that controls
- * initialization for a second call to {@code Init} you should
- * explicitly call {@code End()} and then use {@code SetVariable}
- * before {@code Init}. This is only a very rare use case, since there are
- * very few uses that require any parameters to be set before
- * {@code Init}.
- *
- * If {@code set_only_non_debug_params} is true, only params that do
- * not contain "debug" in the name will be set.
- *
- * @param handle the TesseractAPI instance
- * @param datapath The {@code datapath} must be the name of the parent
- * directory of {@code tessdata} and must end in
- * /. Any name after the last / will be stripped.
- * @param language The language is (usually) an ISO 639-3
- * string; {@code NULL} will default to {@code eng}. The
- * language may be a string of the form {@code [~]<lang>[+[~]<lang>]}
- * indicating that multiple languages are to be loaded. E.g.,
- * {@code hin+eng} will load Hindi and English.
- * @param oem ocr engine mode
- * @param configs pointer configuration
- * @param configs_size pointer configuration size
- * @return 0 on success and -1 on initialization failure
- */
- public static native int TessBaseAPIInit1(TessBaseAPI handle, String datapath, String language, int oem,
- PointerByReference configs, int configs_size);
-
- /**
- * @param handle the TesseractAPI instance
- * @param datapath The {@code datapath} must be the name of the parent
- * directory of {@code tessdata} and must end in
- * /. Any name after the last / will be stripped.
- * @param language The language is (usually) an ISO 639-3
- * string; {@code NULL} will default to {@code eng}. The
- * language may be a string of the form {@code [~]<lang>[+[~]<lang>]}
- * indicating that multiple languages are to be loaded. E.g.,
- * {@code hin+eng} will load Hindi and English.
- * @param oem ocr engine mode
- * @return 0 on success and -1 on initialization failure
- */
- public static native int TessBaseAPIInit2(TessBaseAPI handle, String datapath, String language, int oem);
-
- /**
- * @param handle the TesseractAPI instance
- * @param datapath The {@code datapath} must be the name of the parent
- * directory of {@code tessdata} and must end in
- * /. Any name after the last / will be stripped.
- * @param language The language is (usually) an ISO 639-3
- * string; {@code NULL} will default to {@code eng}. The
- * language may be a string of the form {@code [~]<lang>[+[~]<lang>]}
- * indicating that multiple languages are to be loaded. E.g.,
- * {@code hin+eng} will load Hindi and English.
- * @return 0 on success and -1 on initialization failure
- */
- public static native int TessBaseAPIInit3(TessBaseAPI handle, String datapath, String language);
-
- /**
- *
- * @param handle the TesseractAPI instance
- * @param datapath The {@code datapath} must be the name of the parent
- * directory of {@code tessdata} and must end in
- * /. Any name after the last / will be stripped.
- * @param language The language is (usually) an ISO 639-3
- * string; {@code NULL} will default to {@code eng}. The
- * language may be a string of the form {@code [~]<lang>[+[~]<lang>]}
- * indicating that multiple languages are to be loaded. E.g.,
- * {@code hin+eng} will load Hindi and English.
- * @param oem ocr engine mode
- * @param configs pointer configuration
- * @param configs_size pointer configuration size
- * @param vars_vec
- * @param vars_values
- * @param vars_vec_size
- * @param set_only_non_debug_params
- * @return 0 on success and -1 on initialization failure
- */
- public static native int TessBaseAPIInit4(TessBaseAPI handle, String datapath, String language, int oem, PointerByReference configs, int configs_size, PointerByReference vars_vec, PointerByReference vars_values, NativeSize vars_vec_size, int set_only_non_debug_params);
-
- /**
- * Returns the languages string used in the last valid initialization. If
- * the last initialization specified "deu+hin" then that will be returned.
- * If {@code hin} loaded {@code eng} automatically as well, then
- * that will not be included in this list. To find the languages actually
- * loaded, use {@code GetLoadedLanguagesAsVector}. The returned string
- * should NOT be deleted.
- *
- * @param handle the TesseractAPI instance
- * @return languages as string
- */
- public static native String TessBaseAPIGetInitLanguagesAsString(TessBaseAPI handle);
-
- /**
- * Returns the loaded languages in the vector of STRINGs. Includes all
- * languages loaded by the last Init
, including those loaded as
- * dependencies of other loaded languages.
- *
- * @param handle the TesseractAPI instance
- * @return loaded languages as vector
- */
- public static native PointerByReference TessBaseAPIGetLoadedLanguagesAsVector(TessBaseAPI handle);
-
- /**
- * Returns the available languages in the vector of STRINGs.
- *
- * @param handle the TesseractAPI instance
- * @return available languages as vector
- */
- public static native PointerByReference TessBaseAPIGetAvailableLanguagesAsVector(TessBaseAPI handle);
-
- /**
- * Init only the lang model component of Tesseract. The only functions that
- * work after this init are SetVariable
and
- * IsValidWord
. WARNING: temporary! This function will be
- * removed from here and placed in a separate API at some future time.
- *
- * @param handle the TesseractAPI instance
- * @param datapath The {@code datapath} must be the name of the parent
- * directory of {@code tessdata} and must end in
- * /. Any name after the last / will be stripped.
- * @param language The language is (usually) an ISO 639-3
- * string; {@code NULL} will default to eng. The language may be a
- * string of the form {@code [~]<lang>[+[~]<lang>]} indicating that
- * multiple languages are to be loaded. E.g., hin+eng will load Hindi and
- * English.
- * @return api init language mode
- */
- public static native int TessBaseAPIInitLangMod(TessBaseAPI handle, String datapath, String language);
-
- /**
- * Init only for page layout analysis. Use only for calls to
- * SetImage
and AnalysePage
. Calls that attempt
- * recognition will generate an error.
- *
- * @param handle the TesseractAPI instance
- */
- public static native void TessBaseAPIInitForAnalysePage(TessBaseAPI handle);
-
- /**
- * Read a "config" file containing a set of param, value pairs. Searches the
- * standard places: {@code tessdata/configs},
- * {@code tessdata/tessconfigs} and also accepts a relative or absolute
- * path name. Note: only non-init params will be set (init params are set by
- * {@code Init()}).
- *
- *
- * @param handle the TesseractAPI instance
- * @param filename relative or absolute path for the "config" file
- * containing a set of param and value pairs
- * @param init_only
- */
- public static native void TessBaseAPIReadConfigFile(TessBaseAPI handle, String filename, int init_only);
-
- /**
- * Set the current page segmentation mode. Defaults to
- * PSM_SINGLE_BLOCK
. The mode is stored as an IntParam so it
- * can also be modified by ReadConfigFile
or
- * SetVariable("tessedit_pageseg_mode", mode as string)
.
- *
- * @param handle the TesseractAPI instance
- * @param mode tesseract page segment mode
- */
- public static native void TessBaseAPISetPageSegMode(TessBaseAPI handle, int mode);
-
- /**
- * Return the current page segmentation mode.
- *
- * @param handle the TesseractAPI instance
- * @return page segment mode value
- */
- public static native int TessBaseAPIGetPageSegMode(TessBaseAPI handle);
-
- /**
- * Recognize a rectangle from an image and return the result as a string.
- * May be called many times for a single Init
. Currently has no
- * error checking. Greyscale of 8 and color of 24 or 32 bits per pixel may
- * be given. Palette color images will not work properly and must be
- * converted to 24 bit. Binary images of 1 bit per pixel may also be given
- * but they must be byte packed with the MSB of the first byte being the
- * first pixel, and a 1 represents WHITE. For binary images set
- * bytes_per_pixel=0. The recognized text is returned as a char* which is
- * coded as UTF8 and must be freed with the delete [] operator.
- *
- * Note that {@code TesseractRect} is the simplified convenience
- * interface. For advanced uses, use {@code SetImage}, (optionally)
- * {@code SetRectangle}, {@code Recognize}, and one or more of the
- * {@code Get*Text} functions below.
- *
- * @param handle the TesseractAPI instance
- * @param imagedata image byte buffer
- * @param bytes_per_pixel bytes per pixel
- * @param bytes_per_line bytes per line
- * @param left image left
- * @param top image top
- * @param width image width
- * @param height image height
- * @return the pointer to recognized text
- */
- public static native Pointer TessBaseAPIRect(TessBaseAPI handle, ByteBuffer imagedata,
- int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height);
-
- /**
- * Call between pages or documents etc to free up memory and forget adaptive
- * data.
- *
- * @param handle the TesseractAPI instance
- */
- public static native void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI handle);
-
- /**
- * Provide an image for Tesseract to recognize. Format is as
- * {@code TesseractRect} above. Does not copy the image buffer, or take
- * ownership. The source image may be destroyed after {@code Recognize} is called,
- * either explicitly or implicitly via one of the {@code Get*Text}
- * functions. {@code SetImage} clears all recognition results, and sets
- * the rectangle to the full image, so it may be followed immediately by a
- * {@code GetUTF8Text}, and it will automatically perform recognition.
- *
- * @param handle the TesseractAPI instance
- * @param imagedata image byte buffer
- * @param width image width
- * @param height image height
- * @param bytes_per_pixel bytes per pixel
- * @param bytes_per_line bytes per line
- */
- public static native void TessBaseAPISetImage(TessBaseAPI handle, ByteBuffer imagedata, int width,
- int height, int bytes_per_pixel, int bytes_per_line);
-
- /**
- * Provide an image for Tesseract to recognize. As with
- * {@code SetImage} above, Tesseract doesn't take a copy or ownership
- * or {@code pixDestroy} the image, so it must persist until after
- * {@code Recognize}. {@code Pix} vs raw, which to use? Use
- * {@code Pix} where possible. A future version of Tesseract may choose
- * to use {@code Pix} as its internal representation and discard
- * {@code IMAGE} altogether. Because of that, an implementation that
- * sources and targets {@code Pix} may end up with fewer copies than an
- * implementation that does not.
- *
- * @param handle the TesseractAPI instance
- * @param pix the Leptonica {@code Pix} image to recognize
- */
- public static native void TessBaseAPISetImage2(TessBaseAPI handle, Pix pix);
-
- /**
- * Set the resolution of the source image in pixels per inch so font size
- * information can be calculated in results. Call this after
- * SetImage()
.
- *
- * @param handle the TesseractAPI instance
- * @param ppi source resolution value
- */
- public static native void TessBaseAPISetSourceResolution(TessBaseAPI handle, int ppi);
-
- /**
- * Restrict recognition to a sub-rectangle of the image. Call after
- * SetImage
. Each SetRectangle
clears the
- * recognition results so multiple rectangles can be recognized with the
- * same image.
- *
- * @param handle the TesseractAPI instance
- * @param left value
- * @param top value
- * @param width value
- * @param height value
- */
- public static native void TessBaseAPISetRectangle(TessBaseAPI handle, int left, int top, int width,
- int height);
-
- /**
- * ONLY available after SetImage
if you have Leptonica
- * installed. Get a copy of the internal thresholded image from Tesseract.
- *
- * @param handle the TesseractAPI instance
- * @return internal thresholded image
- */
- public static native Pix TessBaseAPIGetThresholdedImage(TessBaseAPI handle);
-
- /**
- * Get the result of page layout analysis as a Leptonica-style
- * Boxa
, Pixa
pair, in reading order. Can be
- * called before or after Recognize
.
- *
- * @param handle the TesseractAPI instance
- * @param pixa array of Pix
- * @return array of Box
- */
- public static native Boxa TessBaseAPIGetRegions(TessBaseAPI handle, PointerByReference pixa);
-
- /**
- * Get the textlines as a Leptonica-style Boxa
,
- * Pixa
pair, in reading order. Can be called before or after
- * Recognize
. If blockids
is not NULL
, the
- * block-id of each line is also returned as an array of one element per
- * line. delete [] after use. If paraids
is not
- * NULL
, the paragraph-id of each line within its block is also
- * returned as an array of one element per line. delete [] after use.Boxa
,
- * Pixa
pair, in reading order. Can be called before or after
- * Recognize
. If blockids
is not NULL
, the
- * block-id of each line is also returned as an array of one element per
- * line. delete [] after use. If paraids
is not
- * NULL
, the paragraph-id of each line within its block is also
- * returned as an array of one element per line. delete [] after use.
- *
- * @param handle the TesseractAPI instance
- * @param raw_image
- * @param raw_padding
- * @param pixa array of Pix
- * @param blockids
- * @param paraids
- * @return array of Box
- */
- public static native Boxa TessBaseAPIGetTextlines1(TessBaseAPI handle, int raw_image, int raw_padding, PointerByReference pixa, PointerByReference blockids, PointerByReference paraids);
-
- /**
- * Get textlines and strips of image regions as a Leptonica-style
- * Boxa
, Pixa
pair, in reading order. Enables
- * downstream handling of non-rectangular regions. Can be called before or
- * after Recognize
. If blockids
is not NULL, the block-id of
- * each line is also returned as an array of one element per line. delete []
- * after use.
- *
- * @param handle the TesseractAPI instance
- * @param pixa array of Pix
- * @param blockids
- * @return array of Box
- */
- public static native Boxa TessBaseAPIGetStrips(TessBaseAPI handle, PointerByReference pixa, PointerByReference blockids);
-
- /**
- * Get the words as a Leptonica-style Boxa
, Pixa
- * pair, in reading order. Can be called before or after
- * Recognize
.
- *
- * @param handle the TesseractAPI instance
- * @param pixa array of Pix
- * @return array of Box
- */
- public static native Boxa TessBaseAPIGetWords(TessBaseAPI handle, PointerByReference pixa);
-
- /**
- * Gets the individual connected (text) components (created after pages
- * segmentation step, but before recognition) as a Leptonica-style
- * Boxa
, Pixa
pair, in reading order. Can be
- * called before or after Recognize
.
- *
- * @param handle the TesseractAPI instance
- * @param cc array of Pix
- * @return array of Box
- */
- public static native Boxa TessBaseAPIGetConnectedComponents(TessBaseAPI handle, PointerByReference cc);
-
- /**
- * Get the given level kind of components (block, textline, word etc.) as a
- * Leptonica-style Boxa
, Pixa
pair, in reading
- * order. Can be called before or after Recognize
. If blockids
- * is not NULL
, the block-id of each component is also returned
- * as an array of one element per component. delete [] after use. If
- * text_only
is true, then only text components are returned.
- * Helper function to get binary images with no padding (most common usage).
- *
- * @param handle the TesseractAPI instance
- * @param level PageIteratorLevel
- * @param text_only
- * @param pixa array of Pix
- * @param blockids
- * @return array of Box
- */
- public static native Boxa TessBaseAPIGetComponentImages(TessBaseAPI handle, int level, int text_only, PointerByReference pixa, PointerByReference blockids);
-
- /**
- * Get the given level kind of components (block, textline, word etc.) as a
- * Leptonica-style Boxa
, Pixa
pair, in reading
- * order. Can be called before or after Recognize
. If blockids
- * is not NULL
, the block-id of each component is also returned
- * as an array of one element per component. delete [] after use. If
- * paraids
is not NULL
, the paragraph-id of each
- * component with its block is also returned as an array of one element per
- * component. delete [] after use. If raw_image
is true, then
- * portions of the original image are extracted instead of the thresholded
- * image and padded with raw_padding. If text_only
is true,
- * then only text components are returned.
- *
- * @param handle the TesseractAPI instance
- * @param level PageIteratorLevel
- * @param text_only
- * @param raw_image
- * @param raw_padding
- * @param pixa array of Pix
- * @param blockids
- * @param paraids
- * @return array of Box
- */
- public static native Boxa TessBaseAPIGetComponentImages1(TessBaseAPI handle, int level, int text_only, int raw_image, int raw_padding, PointerByReference pixa, PointerByReference blockids, PointerByReference paraids);
-
- /**
- * @param handle the TesseractAPI instance
- * @return Scale factor from original image.
- */
- public static native int TessBaseAPIGetThresholdedImageScaleFactor(TessBaseAPI handle);
-
- /**
- * Dump the internal binary image to a PGM file.
- *
- * @param handle the TesseractAPI instance
- * @param filename pgm file name
- */
- public static native void TessBaseAPIDumpPGM(TessBaseAPI handle, String filename);
-
- /**
- * Runs page layout analysis in the mode set by SetPageSegMode
.
- * May optionally be called prior to Recognize
to get access to
- * just the page layout results. Returns an iterator to the results. Returns
- * NULL
on error. The returned iterator must be deleted after
- * use. WARNING! This class points to data held within the
- * TessBaseAPI
class, and therefore can only be used while the
- * TessBaseAPI
class still exists and has not been subjected to
- * a call of Init
, SetImage
,
- * Recognize
, Clear
, End
, DetectOS,
- * or anything else that changes the internal PAGE_RES
.
- *
- * @param handle the TesseractAPI instance
- * @return returns an iterator to the results. Returns NULL on error. The
- * returned iterator must be deleted after use.
- */
- public static native TessPageIterator TessBaseAPIAnalyseLayout(TessBaseAPI handle);
-
- /**
- * Recognize the image from SetAndThresholdImage
, generating
- * Tesseract internal structures. Returns 0 on success. Optional. The
- * Get*Text
functions below will call Recognize
if
- * needed. After Recognize
, the output is kept internally until
- * the next SetImage
.
- *
- * @param handle the TesseractAPI instance
- * @param monitor the result as Tesseract internal structures
- * @return 0 on success
- */
- public static native int TessBaseAPIRecognize(TessBaseAPI handle, ETEXT_DESC monitor);
-
- /**
- * Variant on Recognize
used for testing chopper.
- *
- * @param handle the TesseractAPI instance
- * @param monitor the result as Tesseract internal structures
- * @return 0 on success
- */
- public static native int TessBaseAPIRecognizeForChopTest(TessBaseAPI handle, ETEXT_DESC monitor);
-
- /**
- * Get a reading-order iterator to the results of LayoutAnalysis and/or
- * Recognize
. The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the
- * TessBaseAPI
class, and therefore can only be used while the
- * TessBaseAPI
class still exists and has not been subjected to
- * a call of Init
, SetImage
,
- * Recognize
, Clear
, End
, DetectOS,
- * or anything else that changes the internal PAGE_RES.
- *
- * @param handle the TesseractAPI instance
- * @return the result iterator
- */
- public static native TessResultIterator TessBaseAPIGetIterator(TessBaseAPI handle);
-
- /**
- * Get a mutable iterator to the results of LayoutAnalysis and/or
- * Recognize
. The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the
- * TessBaseAPI
class, and therefore can only be used while the
- * TessBaseAPI
class still exists and has not been subjected to
- * a call of Init
, SetImage
,
- * Recognize
, Clear
, End
, DetectOS,
- * or anything else that changes the internal PAGE_RES
.
- *
- * @param handle the TesseractAPI instance
- * @return the mutable iterator
- */
- public static native TessMutableIterator TessBaseAPIGetMutableIterator(TessBaseAPI handle);
-
- /**
- * Recognizes all the pages in the named file, as a multi-page tiff or list
- * of filenames, or single image, and gets the appropriate kind of text
- * according to parameters: tessedit_create_boxfile
,
- * tessedit_make_boxes_from_boxes
,
- * tessedit_write_unlv
, tessedit_create_hocr
.
- * Calls ProcessPage on each page in the input file, which may be a
- * multi-page tiff, single-page other file format, or a plain text list of
- * images to read. If tessedit_page_number is non-negative, processing
- * begins at that page of a multi-page tiff file, or filelist. The text is
- * returned in text_out. Returns false on error. If non-zero
- * timeout_millisec terminates processing after the timeout on a single
- * page. If non-NULL and non-empty, and some page fails for some reason, the
- * page is reprocessed with the retry_config config file. Useful for
- * interactively debugging a bad page.
- *
- * @param handle the TesseractAPI instance
- * @param filename multi-page tiff or list of filenames
- * @param retry_config retry config values
- * @param timeout_millisec timeout value
- * @param renderer result renderer
- * @return the status
- */
- public static native int TessBaseAPIProcessPages(TessBaseAPI handle, String filename, String retry_config, int timeout_millisec, TessResultRenderer renderer);
-
- public static native int TessBaseAPIProcessPage(TessBaseAPI handle, Pix pix, int page_index, String filename, String retry_config, int timeout_millisec, TessResultRenderer renderer);
-
- /**
- * The recognized text is returned as a char* which is coded as UTF-8 and
- * must be freed with the delete [] operator.
- *
- * @param handle the TesseractAPI instance
- * @return the pointer to output text
- */
- public static native Pointer TessBaseAPIGetUTF8Text(TessBaseAPI handle);
-
- /**
- * Make a HTML-formatted string with hOCR markup from the internal data
- * structures. page_number is 0-based but will appear in the output as
- * 1-based.
- *
- * @param handle the TesseractAPI instance
- * @param page_number page number
- * @return the pointer to hOCR text
- */
- public static native Pointer TessBaseAPIGetHOCRText(TessBaseAPI handle, int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded as a UTF8 box
- * file and must be freed with the delete [] operator. page_number is a
- * 0-base page index that will appear in the box file.
- *
- * @param handle the TesseractAPI instance
- * @param page_number number of the page
- * @return the pointer to box text
- */
- public static native Pointer TessBaseAPIGetBoxText(TessBaseAPI handle, int page_number);
-
- /**
- * The recognized text is returned as a char* which is coded as UNLV format
- * Latin-1 with specific reject and suspect codes and must be freed with the
- * delete [] operator.
- *
- * @param handle the TesseractAPI instance
- * @return the pointer to UNLV text
- */
- public static native Pointer TessBaseAPIGetUNLVText(TessBaseAPI handle);
-
- /**
- * Returns the average word confidence for Tesseract page result.
- *
- * @param handle the TesseractAPI instance
- * @return the (average) confidence value between 0 and 100.
- */
- public static native int TessBaseAPIMeanTextConf(TessBaseAPI handle);
-
- /**
- * Returns an array of all word confidences, terminated by -1. The calling
- * function must delete [] after use. The number of confidences should
- * correspond to the number of space-delimited words in
- * GetUTF8Text
.
- *
- * @param handle the TesseractAPI instance
- * @return all word confidences (between 0 and 100) in an array, terminated
- * by -1
- */
- public static native IntByReference TessBaseAPIAllWordConfidences(TessBaseAPI handle);
-
- /**
- * Applies the given word to the adaptive classifier if possible. The word
- * must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can tell the
- * boundaries of the graphemes. Assumes that
- * SetImage
/SetRectangle
have been used to set the
- * image to the given word. The mode arg should be
- * PSM_SINGLE_WORD
or PSM_CIRCLE_WORD
, as that
- * will be used to control layout analysis. The currently set PageSegMode is
- * preserved.
- *
- * @param handle the TesseractAPI instance
- * @param mode tesseract page segment mode
- * @param wordstr The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s ,
- * so it can tell the boundaries of the graphemes.
- * @return false if adaption was not possible for some reason.
- */
- public static native int TessBaseAPIAdaptToWordStr(TessBaseAPI handle, int mode, String wordstr);
-
- /**
- * Free up recognition results and any stored image data, without actually
- * freeing any recognition data that would be time-consuming to reload.
- * Afterwards, you must call SetImage
or
- * TesseractRect
before doing any Recognize
or
- * Get*
operation.
- *
- * @param handle the TesseractAPI instance
- */
- public static native void TessBaseAPIClear(TessBaseAPI handle);
-
- /**
- * Close down tesseract and free up all memory. End()
is
- * equivalent to destructing and reconstructing your TessBaseAPI. Once
- * End()
has been used, none of the other API functions may be
- * used other than Init
and anything declared above it in the
- * class definition.
- *
- * @param handle the TesseractAPI instance
- */
- public static native void TessBaseAPIEnd(TessBaseAPI handle);
-
- /**
- * Check whether a word is valid according to Tesseract's language model.
- *
- * @param handle the TesseractAPI instance
- * @param word word value
- * @return 0 if the word is invalid, non-zero if valid
- */
- public static native int TessBaseAPIIsValidWord(TessBaseAPI handle, String word);
-
- /**
- * Gets text direction.
- *
- * @param handle the TesseractAPI instance
- * @param out_offset offset
- * @param out_slope slope
- * @return TRUE if text direction is valid
- */
- public static native int TessBaseAPIGetTextDirection(TessBaseAPI handle, IntBuffer out_offset,
- FloatBuffer out_slope);
-
- /**
- * Clear any library-level memory caches. There are a variety of
- * expensive-to-load constant data structures (mostly language dictionaries)
- * that are cached globally -- surviving the Init()
and
- * End()
of individual TessBaseAPI's. This function allows the
- * clearing of these caches.
- *
- * @param handle the TesseractAPI instance
- */
- public static native void TessBaseAPIClearPersistentCache(TessBaseAPI handle);
-
- /**
- * Detect the orientation of the input image and apparent script (alphabet).
- * {@code orient_deg} is the detected clockwise rotation of the input image in
- * degrees (0, 90, 180, 270); {@code orient_conf} is the confidence (15.0 is
- * reasonably confident); {@code script_name} is an ASCII string, the name of the
- * script, e.g. "Latin"; {@code script_conf} is confidence level in the script.
- *
- * @return TRUE on success and writes values to each parameter as an output
- */
- public static native int TessBaseAPIDetectOrientationScript(TessBaseAPI handle, IntBuffer orient_deg, FloatBuffer orient_conf, PointerByReference script_name, FloatBuffer script_conf);
-
- /**
- * Gets the string of the specified unichar.
- *
- * @param handle the TesseractAPI instance
- * @param unichar_id the unichar id
- * @return the string form of the specified unichar.
- */
- public static native String TessBaseAPIGetUnichar(TessBaseAPI handle, int unichar_id);
-
- /**
- * Deletes the specified PageIterator instance.
- *
- * @param handle the TessPageIterator instance
- */
- public static native void TessPageIteratorDelete(TessPageIterator handle);
-
- /**
- * Creates a copy of the specified PageIterator instance.
- *
- * @param handle the TessPageIterator instance
- * @return page iterator copy
- */
- public static native TessPageIterator TessPageIteratorCopy(TessPageIterator handle);
-
- /**
- * Resets the iterator to point to the start of the page.
- *
- * @param handle the TessPageIterator instance
- */
- public static native void TessPageIteratorBegin(TessPageIterator handle);
-
- /**
- * Moves to the start of the next object at the given level in the page
- * hierarchy, and returns false if the end of the page was reached. NOTE
- * (CHANGED!) that ALL PageIteratorLevel level values will visit each
- * non-text block at least once.PTIsTextType(BLockType())
is false for non-text blocks.pixDestroy
to delete the image after use. The following
- * methods are used to generate the images: RIL_BLOCK
: mask the
- * page image with the block polygon. RIL_TEXTLINE
: Clip the
- * rectangle of the line box from the page image. TODO(rays) fix this to
- * generate and use a line polygon. RIL_WORD
: Clip the
- * rectangle of the word box from the page image. RIL_SYMBOL
:
- * Render the symbol outline to an image for cblobs (prior to recognition)
- * or the bounding box otherwise. A reconstruction of the original image
- * (using xor to check for double representation) should be reasonably
- * accurate, apart from removed noise, at the block level. Below the block
- * level, the reconstruction will be missing images and line separators. At
- * the symbol level, kerned characters will invade the bounding box if
- * rendered after recognition, making an xor reconstruction inaccurate, but
- * an or construction better. Before recognition, symbol-level
- * reconstruction should be good, even with xor, since the images come from
- * the connected components.
- *
- * @param handle the TessPageIterator instance
- * @param level PageIteratorLevel
- * @return the binary image of the current object at the given level
- */
- public static native Pix TessPageIteratorGetBinaryImage(TessPageIterator handle, int level);
-
- /**
- * Returns an image of the current object at the given level in greyscale if
- * available in the input. To guarantee a binary image use BinaryImage. NOTE
- * that in order to give the best possible image, the bounds are expanded
- * slightly over the binary connected component, by the supplied padding, so
- * the top-left position of the returned image is returned in (left,top).
- * These will most likely not match the coordinates returned by BoundingBox.
- * If you do not supply an original image, you will get a binary one. Use
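- * {@code pixDestroy} to delete the image after use.
- *
- * @param handle the TessPageIterator instance
- * @param level PageIteratorLevel
- * @param padding amount by which the bounds are expanded around the binary connected component
- * @param original_image the original image; if not supplied, a binary image is returned
- * @param left receives the left coordinate of the returned image
- * @param top receives the top coordinate of the returned image
- * @return the image of the current object at the given level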
- */
- public static native Pix TessPageIteratorGetImage(TessPageIterator handle, int level, int padding, Pix original_image, IntBuffer left, IntBuffer top);
-
- /**
- * Returns the baseline of the current object at the given level. The
- * baseline is the line that passes through (x1, y1) and (x2, y2).Init
, SetImage
, End
or deleting
- * the TessBaseAPI. Pointsize is returned in printers points (1/72 inch).
- *
- * @param handle the TessResultIterator instance
- * @param is_bold font attribute
- * @param is_italic font attribute
- * @param is_underlined font attribute
- * @param is_monospace font attribute
- * @param is_serif font attribute
- * @param is_smallcaps font attribute
- * @param pointsize font attribute
- * @param font_id font attribute
- * @return font name
- */
- public static native String TessResultIteratorWordFontAttributes(TessResultIterator handle,
- IntBuffer is_bold, IntBuffer is_italic, IntBuffer is_underlined, IntBuffer is_monospace,
- IntBuffer is_serif, IntBuffer is_smallcaps, IntBuffer pointsize, IntBuffer font_id);
-
- /**
- * Returns TRUE if the current word was found in a dictionary.
- *
- * @param handle the TessResultIterator instance
- * @return 1 if word is from dictionary
- */
- public static native int TessResultIteratorWordIsFromDictionary(TessResultIterator handle);
-
- /**
- * Returns TRUE if the current word is numeric.
- *
- * @param handle the TessResultIterator instance
- * @return 1 if word is numeric
- */
- public static native int TessResultIteratorWordIsNumeric(TessResultIterator handle);
-
- /**
- * Returns TRUE if the current symbol is a superscript. If iterating at a
- * higher level object than symbols, e.g., words, then this will return the
- * attributes of the first symbol in that word.
- *
- * @param handle the TessResultIterator instance
- * @return 1 if symbol is superscript
- */
- public static native int TessResultIteratorSymbolIsSuperscript(TessResultIterator handle);
-
- /**
- * Returns TRUE if the current symbol is a subscript. If iterating at a
- * higher level object than symbols, e.g., words, then this will return the
- * attributes of the first symbol in that word.
- *
- * @param handle the TessResultIterator instance
- * @return 1 if symbol is subscript
- */
- public static native int TessResultIteratorSymbolIsSubscript(TessResultIterator handle);
-
- /**
- * Returns TRUE if the current symbol is a dropcap. If iterating at a higher
- * level object than symbols, e.g., words, then this will return the
- * attributes of the first symbol in that word.
- *
- * @param handle the TessResultIterator instance
- * @return 1 if symbol is dropcap
- */
- public static native int TessResultIteratorSymbolIsDropcap(TessResultIterator handle);
-
- /* Choice iterator */
- public static native TessChoiceIterator TessResultIteratorGetChoiceIterator(TessResultIterator handle);
-
- public static native void TessChoiceIteratorDelete(TessChoiceIterator handle);
-
- public static native int TessChoiceIteratorNext(TessChoiceIterator handle);
-
- public static native String TessChoiceIteratorGetUTF8Text(TessChoiceIterator handle);
-
- public static native float TessChoiceIteratorConfidence(TessChoiceIterator handle);
-}
diff --git a/Tess4J/src/net/sourceforge/tess4j/Tesseract.java b/Tess4J/src/net/sourceforge/tess4j/Tesseract.java
deleted file mode 100644
index 2410e27..0000000
--- a/Tess4J/src/net/sourceforge/tess4j/Tesseract.java
+++ /dev/null
@@ -1,682 +0,0 @@
-/**
- * Copyright @ 2012 Quan Nguyen
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package net.sourceforge.tess4j;
-
-import com.sun.jna.Pointer;
-import com.sun.jna.StringArray;
-import com.sun.jna.ptr.PointerByReference;
-import java.awt.Rectangle;
-import java.awt.image.*;
-import java.io.*;
-import java.nio.ByteBuffer;
-import java.nio.IntBuffer;
-import java.util.*;
-import javax.imageio.IIOImage;
-import net.sourceforge.lept4j.Box;
-import net.sourceforge.lept4j.Boxa;
-import static net.sourceforge.lept4j.ILeptonica.L_CLONE;
-import net.sourceforge.lept4j.Leptonica;
-import static net.sourceforge.tess4j.ITessAPI.TRUE;
-
-import net.sourceforge.tess4j.ITessAPI.TessBaseAPI;
-import net.sourceforge.tess4j.ITessAPI.TessOcrEngineMode;
-import net.sourceforge.tess4j.ITessAPI.TessPageIterator;
-import net.sourceforge.tess4j.ITessAPI.TessResultIterator;
-import net.sourceforge.tess4j.ITessAPI.TessResultRenderer;
-
-import net.sourceforge.tess4j.util.ImageIOHelper;
-import net.sourceforge.tess4j.util.LoggHelper;
-import net.sourceforge.tess4j.util.PdfUtilities;
-import org.slf4j.*;
-
-/**
- * An object layer on top of TessAPI
, provides character
- * recognition support for common image formats, and multi-page TIFF images
- * beyond the uncompressed, binary TIFF format supported by Tesseract OCR
- * engine. The extended capabilities are provided by the
- * Java Advanced Imaging Image I/O Tools
.Ghost4J
, a
- * JNA
wrapper for GPL Ghostscript
, which should be
- * installed and included in system path..jar
files for jna
,
- * jai-imageio
, and ghost4j
) are in its compile and
- * run-time classpath
.
- */
-public class Tesseract implements ITesseract {
-
- private static Tesseract instance;
- private String language = "eng";
- private String datapath;
- private RenderedFormat renderedFormat = RenderedFormat.TEXT;
- private int psm = -1;
- private int ocrEngineMode = TessOcrEngineMode.OEM_DEFAULT;
- private final Properties prop = new Properties();
- private final Listtessdata
.
- *
- * @param datapath the tessdata path to set
- */
- @Override
- public void setDatapath(String datapath) {
- this.datapath = datapath;
- }
-
- /**
- * Sets language for OCR.
- *
- * @param language the language code, which follows ISO 639-3 standard.
- */
- @Override
- public void setLanguage(String language) {
- this.language = language;
- }
-
- /**
- * Sets OCR engine mode.
- *
- * @param ocrEngineMode the OcrEngineMode to set
- */
- @Override
- public void setOcrEngineMode(int ocrEngineMode) {
- this.ocrEngineMode = ocrEngineMode;
- }
-
- /**
- * Sets page segmentation mode.
- *
- * @param mode the page segmentation mode to set
- */
- @Override
- public void setPageSegMode(int mode) {
- this.psm = mode;
- }
-
- /**
- * Enables hocr output.
- *
- * @param hocr to enable or disable hocr output
- */
- public void setHocr(boolean hocr) {
- this.renderedFormat = hocr ? RenderedFormat.HOCR : RenderedFormat.TEXT;
- prop.setProperty("tessedit_create_hocr", hocr ? "1" : "0");
- }
-
- /**
- * Set the value of Tesseract's internal parameter.
- *
- * @param key variable name, e.g., tessedit_create_hocr
,
- * tessedit_char_whitelist
, etc.
- * @param value value for corresponding variable, e.g., "1", "0",
- * "0123456789", etc.
- */
- @Override
- public void setTessVariable(String key, String value) {
- prop.setProperty(key, value);
- }
-
- /**
- * Sets configs to be passed to Tesseract's Init
method.
- *
- * @param configs list of config filenames, e.g., "digits", "bazaar",
- * "quiet"
- */
- @Override
- public void setConfigs(Listnull
indicates
- * the whole image.
- * @return the recognized text
- * @throws TesseractException
- */
- @Override
- public String doOCR(File imageFile, Rectangle rect) throws TesseractException {
- try {
- return doOCR(ImageIOHelper.getIIOImageList(imageFile), imageFile.getPath(), rect);
- } catch (Exception e) {
- logger.error(e.getMessage(), e);
- throw new TesseractException(e);
- }
- }
-
- /**
- * Performs OCR operation.
- *
- * @param bi a buffered image
- * @return the recognized text
- * @throws TesseractException
- */
- @Override
- public String doOCR(BufferedImage bi) throws TesseractException {
- return doOCR(bi, null);
- }
-
- /**
- * Performs OCR operation.
- *
- * @param bi a buffered image
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or null
indicates
- * the whole image.
- * @return the recognized text
- * @throws TesseractException
- */
- @Override
- public String doOCR(BufferedImage bi, Rectangle rect) throws TesseractException {
- try {
- return doOCR(ImageIOHelper.getIIOImageList(bi), rect);
- } catch (Exception e) {
- logger.error(e.getMessage(), e);
- throw new TesseractException(e);
- }
- }
-
- /**
- * Performs OCR operation.
- *
- * @param imageList a list of IIOImage
objects
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or null
indicates
- * the whole image.
- * @return the recognized text
- * @throws TesseractException
- */
- @Override
- public String doOCR(ListIIOImage
objects
- * @param filename input file name. Needed only for training and reading a
- * UNLV zone file.
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or null
indicates
- * the whole image.
- * @return the recognized text
- * @throws TesseractException
- */
- @Override
- public String doOCR(ListSetImage
, (optionally)
- * SetRectangle
, and one or more of the Get*Text
- * functions.
- *
- * @param xsize width of image
- * @param ysize height of image
- * @param buf pixel data
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or null
indicates
- * the whole image.
- * @param bpp bits per pixel, represents the bit depth of the image, with 1
- * for binary bitmap, 8 for gray, and 24 for color RGB.
- * @return the recognized text
- * @throws TesseractException
- */
- @Override
- public String doOCR(int xsize, int ysize, ByteBuffer buf, Rectangle rect, int bpp) throws TesseractException {
- return doOCR(xsize, ysize, buf, null, rect, bpp);
- }
-
- /**
- * Performs OCR operation. Use SetImage
, (optionally)
- * SetRectangle
, and one or more of the Get*Text
- * functions.
- *
- * @param xsize width of image
- * @param ysize height of image
- * @param buf pixel data
- * @param filename input file name. Needed only for training and reading a
- * UNLV zone file.
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or null
indicates
- * the whole image.
- * @param bpp bits per pixel, represents the bit depth of the image, with 1
- * for binary bitmap, 8 for gray, and 24 for color RGB.
- * @return the recognized text
- * @throws TesseractException
- */
- @Override
- public String doOCR(int xsize, int ysize, ByteBuffer buf, String filename, Rectangle rect, int bpp) throws TesseractException {
- init();
- setTessVariables();
-
- try {
- setImage(xsize, ysize, buf, rect, bpp);
- return getOCRText(filename, 1);
- } catch (Exception e) {
- logger.error(e.getMessage(), e);
- throw new TesseractException(e);
- } finally {
- dispose();
- }
- }
-
- /**
- * Initializes Tesseract engine.
- */
- protected void init() {
- api = TessAPI.INSTANCE;
- handle = api.TessBaseAPICreate();
- StringArray sarray = new StringArray(configList.toArray(new String[0]));
- PointerByReference configs = new PointerByReference();
- configs.setPointer(sarray);
- api.TessBaseAPIInit1(handle, datapath, language, ocrEngineMode, configs, configList.size());
- if (psm > -1) {
- api.TessBaseAPISetPageSegMode(handle, psm);
- }
- }
-
- /**
- * Sets Tesseract's internal parameters.
- */
- protected void setTessVariables() {
- Enumeration> em = prop.propertyNames();
- while (em.hasMoreElements()) {
- String key = (String) em.nextElement();
- api.TessBaseAPISetVariable(handle, key, prop.getProperty(key));
- }
- }
-
- /**
- * A wrapper for {@link #setImage(int, int, ByteBuffer, Rectangle, int)}.
- *
- * @param image a rendered image
- * @param rect region of interest
- * @throws java.io.IOException
- */
- protected void setImage(RenderedImage image, Rectangle rect) throws IOException {
- setImage(image.getWidth(), image.getHeight(), ImageIOHelper.getImageByteBuffer(image), rect, image
- .getColorModel().getPixelSize());
- }
-
- /**
- * Sets image to be processed.
- *
- * @param xsize width of image
- * @param ysize height of image
- * @param buf pixel data
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or null
indicates
- * the whole image.
- * @param bpp bits per pixel, represents the bit depth of the image, with 1
- * for binary bitmap, 8 for gray, and 24 for color RGB.
- */
- protected void setImage(int xsize, int ysize, ByteBuffer buf, Rectangle rect, int bpp) {
- int bytespp = bpp / 8;
- int bytespl = (int) Math.ceil(xsize * bpp / 8.0);
- api.TessBaseAPISetImage(handle, buf, xsize, ysize, bytespp, bytespl);
-
- if (rect != null && !rect.isEmpty()) {
- api.TessBaseAPISetRectangle(handle, rect.x, rect.y, rect.width, rect.height);
- }
- }
-
- /**
- * Gets recognized text.
- *
- * @param filename input file name. Needed only for reading a UNLV zone
- * file.
- * @param pageNum page number; needed for hocr paging.
- * @return the recognized text
- */
- protected String getOCRText(String filename, int pageNum) {
- if (filename != null && !filename.isEmpty()) {
- api.TessBaseAPISetInputName(handle, filename);
- }
-
- Pointer utf8Text = renderedFormat == RenderedFormat.HOCR ? api.TessBaseAPIGetHOCRText(handle, pageNum - 1) : api.TessBaseAPIGetUTF8Text(handle);
- String str = utf8Text.getString(0);
- api.TessDeleteText(utf8Text);
- return str;
- }
-
- /**
- * Creates renderers for given formats.
- *
- * @param outputbase
- * @param formats
- * @return
- */
- private TessResultRenderer createRenderers(String outputbase, ListRectangle
- * @throws TesseractException
- */
- @Override
- public ListWord
- */
- @Override
- public ListTessAPI1
, provides character
- * recognition support for common image formats, and multi-page TIFF images
- * beyond the uncompressed, binary TIFF format supported by Tesseract OCR
- * engine. The extended capabilities are provided by the
- * Java Advanced Imaging Image I/O Tools
.Ghost4J
, a
- * JNA
wrapper for GPL Ghostscript
, which should be
- * installed and included in system path..jar
files for jna
,
- * jai-imageio
, and ghost4j
) are in its compile and
- * run-time classpath
.
- */
-public class Tesseract1 extends TessAPI1 implements ITesseract {
-
- private String language = "eng";
- private String datapath;
- private RenderedFormat renderedFormat = RenderedFormat.TEXT;
- private int psm = -1;
- private int ocrEngineMode = TessOcrEngineMode.OEM_DEFAULT;
- private final Properties prop = new Properties();
- private final Listtessdata
.
- *
- * @param datapath the tessdata path to set
- */
- @Override
- public void setDatapath(String datapath) {
- this.datapath = datapath;
- }
-
- /**
- * Sets language for OCR.
- *
- * @param language the language code, which follows ISO 639-3 standard.
- */
- @Override
- public void setLanguage(String language) {
- this.language = language;
- }
-
- /**
- * Sets OCR engine mode.
- *
- * @param ocrEngineMode the OcrEngineMode to set
- */
- @Override
- public void setOcrEngineMode(int ocrEngineMode) {
- this.ocrEngineMode = ocrEngineMode;
- }
-
- /**
- * Sets page segmentation mode.
- *
- * @param mode the page segmentation mode to set
- */
- @Override
- public void setPageSegMode(int mode) {
- this.psm = mode;
- }
-
- /**
- * Enables hocr output.
- *
- * @param hocr to enable or disable hocr output
- */
- public void setHocr(boolean hocr) {
- this.renderedFormat = hocr ? RenderedFormat.HOCR : RenderedFormat.TEXT;
- prop.setProperty("tessedit_create_hocr", hocr ? "1" : "0");
- }
-
- /**
- * Set the value of Tesseract's internal parameter.
- *
- * @param key variable name, e.g., tessedit_create_hocr
,
- * tessedit_char_whitelist
, etc.
- * @param value value for corresponding variable, e.g., "1", "0",
- * "0123456789", etc.
- */
- @Override
- public void setTessVariable(String key, String value) {
- prop.setProperty(key, value);
- }
-
- /**
- * Sets configs to be passed to Tesseract's Init
method.
- *
- * @param configs list of config filenames, e.g., "digits", "bazaar",
- * "quiet"
- */
- @Override
- public void setConfigs(Listnull
indicates
- * the whole image.
- * @return the recognized text
- * @throws TesseractException
- */
- @Override
- public String doOCR(File imageFile, Rectangle rect) throws TesseractException {
- try {
- return doOCR(ImageIOHelper.getIIOImageList(imageFile), imageFile.getPath(), rect);
- } catch (Exception e) {
- logger.error(e.getMessage(), e);
- throw new TesseractException(e);
- }
- }
-
- /**
- * Performs OCR operation.
- *
- * @param bi a buffered image
- * @return the recognized text
- * @throws TesseractException
- */
- @Override
- public String doOCR(BufferedImage bi) throws TesseractException {
- return doOCR(bi, null);
- }
-
- /**
- * Performs OCR operation.
- *
- * @param bi a buffered image
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or null
indicates
- * the whole image.
- * @return the recognized text
- * @throws TesseractException
- */
- @Override
- public String doOCR(BufferedImage bi, Rectangle rect) throws TesseractException {
- try {
- return doOCR(ImageIOHelper.getIIOImageList(bi), rect);
- } catch (Exception e) {
- logger.error(e.getMessage(), e);
- throw new TesseractException(e);
- }
- }
-
- /**
- * Performs OCR operation.
- *
- * @param imageList a list of IIOImage
objects
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or null
indicates
- * the whole image.
- * @return the recognized text
- * @throws TesseractException
- */
- @Override
- public String doOCR(ListIIOImage
objects
- * @param filename input file name
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or null
indicates
- * the whole image.
- * @return the recognized text
- * @throws TesseractException
- */
- @Override
- public String doOCR(ListSetImage
, (optionally)
- * SetRectangle
, and one or more of the Get*Text
- * functions.
- *
- * @param xsize width of image
- * @param ysize height of image
- * @param buf pixel data
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or null
indicates
- * the whole image.
- * @param bpp bits per pixel, represents the bit depth of the image, with 1
- * for binary bitmap, 8 for gray, and 24 for color RGB.
- * @return the recognized text
- * @throws TesseractException
- */
- @Override
- public String doOCR(int xsize, int ysize, ByteBuffer buf, Rectangle rect, int bpp) throws TesseractException {
- return doOCR(xsize, ysize, buf, null, rect, bpp);
- }
-
- /**
- * Performs OCR operation. Use SetImage
, (optionally)
- * SetRectangle
, and one or more of the Get*Text
- * functions.
- *
- * @param xsize width of image
- * @param ysize height of image
- * @param buf pixel data
- * @param filename input file name. Needed only for training and reading a
- * UNLV zone file.
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or null
indicates
- * the whole image.
- * @param bpp bits per pixel, represents the bit depth of the image, with 1
- * for binary bitmap, 8 for gray, and 24 for color RGB.
- * @return the recognized text
- * @throws TesseractException
- */
- @Override
- public String doOCR(int xsize, int ysize, ByteBuffer buf, String filename, Rectangle rect, int bpp) throws TesseractException {
- init();
- setTessVariables();
-
- try {
- setImage(xsize, ysize, buf, rect, bpp);
- return getOCRText(filename, 1);
- } catch (Exception e) {
- logger.error(e.getMessage(), e);
- throw new TesseractException(e);
- } finally {
- dispose();
- }
- }
-
- /**
- * Initializes Tesseract engine.
- */
- protected void init() {
- handle = TessBaseAPICreate();
- StringArray sarray = new StringArray(configList.toArray(new String[0]));
- PointerByReference configs = new PointerByReference();
- configs.setPointer(sarray);
- TessBaseAPIInit1(handle, datapath, language, ocrEngineMode, configs, configList.size());
- if (psm > -1) {
- TessBaseAPISetPageSegMode(handle, psm);
- }
- }
-
- /**
- * Sets Tesseract's internal parameters.
- */
- protected void setTessVariables() {
- Enumeration> em = prop.propertyNames();
- while (em.hasMoreElements()) {
- String key = (String) em.nextElement();
- TessBaseAPISetVariable(handle, key, prop.getProperty(key));
- }
- }
-
- /**
- * A wrapper for {@link #setImage(int, int, ByteBuffer, Rectangle, int)}.
- *
- * @param image a rendered image
- * @param rect region of interest
- * @throws java.io.IOException
- */
- protected void setImage(RenderedImage image, Rectangle rect) throws IOException {
- setImage(image.getWidth(), image.getHeight(), ImageIOHelper.getImageByteBuffer(image), rect, image
- .getColorModel().getPixelSize());
- }
-
- /**
- * Sets image to be processed.
- *
- * @param xsize width of image
- * @param ysize height of image
- * @param buf pixel data
- * @param rect the bounding rectangle defines the region of the image to be
- * recognized. A rectangle of zero dimension or null
indicates
- * the whole image.
- * @param bpp bits per pixel, represents the bit depth of the image, with 1
- * for binary bitmap, 8 for gray, and 24 for color RGB.
- */
- protected void setImage(int xsize, int ysize, ByteBuffer buf, Rectangle rect, int bpp) {
- int bytespp = bpp / 8;
- int bytespl = (int) Math.ceil(xsize * bpp / 8.0);
- TessBaseAPISetImage(handle, buf, xsize, ysize, bytespp, bytespl);
-
- if (rect != null && !rect.isEmpty()) {
- TessBaseAPISetRectangle(handle, rect.x, rect.y, rect.width, rect.height);
- }
- }
-
- /**
- * Gets recognized text.
- *
- * @param filename input file name. Needed only for reading a UNLV zone
- * file.
- * @param pageNum page number; needed for hocr paging.
- * @return the recognized text
- */
- protected String getOCRText(String filename, int pageNum) {
- if (filename != null && !filename.isEmpty()) {
- TessBaseAPISetInputName(handle, filename);
- }
-
- Pointer utf8Text = renderedFormat == RenderedFormat.HOCR ? TessBaseAPIGetHOCRText(handle, pageNum - 1) : TessBaseAPIGetUTF8Text(handle);
- String str = utf8Text.getString(0);
- TessDeleteText(utf8Text);
- return str;
- }
-
- /**
- * Creates renderers for given formats.
- *
- * @param outputbase
- * @param formats
- * @return
- */
- private TessResultRenderer createRenderers(String outputbase, ListRectangle
- * @throws TesseractException
- */
- @Override
- public ListWord
- */
- @Override
- public ListBufferedImage.getSubimage
- * method.
- *
- * @param image
- * @param x the X coordinate of the upper-left corner of the specified
- * rectangular region
- * @param y the Y coordinate of the upper-left corner of the specified
- * rectangular region
- * @param width the width of the specified rectangular region
- * @param height the height of the specified rectangular region
- * @return a BufferedImage that is the subimage of image
.
- */
- public static BufferedImage getSubImage(BufferedImage image, int x, int y, int width, int height) {
- int type = (image.getTransparency() == Transparency.OPAQUE)
- ? BufferedImage.TYPE_INT_RGB : BufferedImage.TYPE_INT_ARGB;
- BufferedImage tmp = new BufferedImage(width, height, type);
- Graphics2D g2 = tmp.createGraphics();
- g2.drawImage(image.getSubimage(x, y, width, height), 0, 0, null);
- g2.dispose();
- return tmp;
- }
-
- /**
- * A simple method to convert an image to binary or B/W image.
- *
- * @param image input image
- * @return a monochrome image
- */
- public static BufferedImage convertImageToBinary(BufferedImage image) {
- BufferedImage tmp = new BufferedImage(image.getWidth(), image.getHeight(), BufferedImage.TYPE_BYTE_BINARY);
- Graphics2D g2 = tmp.createGraphics();
- g2.drawImage(image, 0, 0, null);
- g2.dispose();
- return tmp;
- }
-
- /**
- * A simple method to convert an image to binary or B/W image.
- *
- * @param image input image
- * @return a monochrome image
- * @deprecated As of release 1.1, renamed to
- * {@link #convertImageToBinary(BufferedImage image)}
- */
- @Deprecated
- public static BufferedImage convertImage2Binary(BufferedImage image) {
- return convertImageToBinary(image);
- }
-
- /**
- * A simple method to convert an image to gray scale.
- *
- * @param image input image
- * @return a monochrome image
- */
- public static BufferedImage convertImageToGrayscale(BufferedImage image) {
- BufferedImage tmp = new BufferedImage(image.getWidth(), image.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
- Graphics2D g2 = tmp.createGraphics();
- g2.drawImage(image, 0, 0, null);
- g2.dispose();
- return tmp;
- }
-
- private static final short[] invertTable;
-
- static {
- invertTable = new short[256];
- for (int i = 0; i < 256; i++) {
- invertTable[i] = (short) (255 - i);
- }
- }
-
- /**
- * Inverts image color.
- *
- * @param image input image
- * @return an inverted-color image
- */
- public static BufferedImage invertImageColor(BufferedImage image) {
- BufferedImage tmp = new BufferedImage(image.getWidth(), image.getHeight(), image.getType());
- BufferedImageOp invertOp = new LookupOp(new ShortLookupTable(0, invertTable), null);
- return invertOp.filter(image, tmp);
- }
-
- /**
- * Rotates an image.
- *
- * @param image the original image
- * @param angle the degree of rotation
- * @return a rotated image
- */
- public static BufferedImage rotateImage(BufferedImage image, double angle) {
- double theta = Math.toRadians(angle);
- double sin = Math.abs(Math.sin(theta));
- double cos = Math.abs(Math.cos(theta));
- int w = image.getWidth();
- int h = image.getHeight();
- int newW = (int) Math.floor(w * cos + h * sin);
- int newH = (int) Math.floor(h * cos + w * sin);
-
- BufferedImage tmp = new BufferedImage(newW, newH, image.getType());
- Graphics2D g2d = tmp.createGraphics();
- g2d.setRenderingHint(RenderingHints.KEY_INTERPOLATION,
- RenderingHints.VALUE_INTERPOLATION_BICUBIC);
- g2d.translate((newW - w) / 2, (newH - h) / 2);
- g2d.rotate(theta, w / 2, h / 2);
- g2d.drawImage(image, 0, 0, null);
- g2d.dispose();
- return tmp;
- }
-
- /**
- * Gets an image from Clipboard.
- *
- * @return image
- */
- public static Image getClipboardImage() {
- Clipboard clipboard = Toolkit.getDefaultToolkit().getSystemClipboard();
- try {
- return (Image) clipboard.getData(DataFlavor.imageFlavor);
- } catch (Exception e) {
- return null;
- }
- }
-
- /**
- * Clones an image.
- * http://stackoverflow.com/questions/3514158/how-do-you-clone-a-bufferedimage
- *
- * @param bi
- * @return
- */
- public static BufferedImage cloneImage(BufferedImage bi) {
- ColorModel cm = bi.getColorModel();
- boolean isAlphaPremultiplied = cm.isAlphaPremultiplied();
- WritableRaster raster = bi.copyData(null);
- return new BufferedImage(cm, raster, isAlphaPremultiplied, null);
- }
-}
diff --git a/Tess4J/src/net/sourceforge/tess4j/util/ImageIOHelper.java b/Tess4J/src/net/sourceforge/tess4j/util/ImageIOHelper.java
deleted file mode 100644
index e6bb7cc..0000000
--- a/Tess4J/src/net/sourceforge/tess4j/util/ImageIOHelper.java
+++ /dev/null
@@ -1,643 +0,0 @@
-/**
- * Copyright @ 2008 Quan Nguyen
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package net.sourceforge.tess4j.util;
-
-import java.io.*;
-
-import java.util.*;
-import javax.imageio.*;
-import javax.imageio.stream.*;
-import javax.imageio.metadata.*;
-import java.awt.Toolkit;
-import java.awt.image.*;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-
-import org.w3c.dom.NodeList;
-
-import com.github.jaiimageio.plugins.tiff.*;
-import com.recognition.software.jdeskew.ImageDeskew;
-import com.recognition.software.jdeskew.ImageUtil;
-import org.apache.commons.io.FilenameUtils;
-
-public class ImageIOHelper {
-
- final static String OUTPUT_FILE_NAME = "Tesstmp";
- final static String TIFF_EXT = ".tif";
- final static String TIFF_FORMAT = "tiff";
- final static String JAI_IMAGE_WRITER_MESSAGE = "Need to install JAI Image I/O package.\nhttps://java.net/projects/jai-imageio/";
- final static String JAI_IMAGE_READER_MESSAGE = "Unsupported image format. May need to install JAI Image I/O package.\nhttps://java.net/projects/jai-imageio/";
-
- /**
- * Creates a list of TIFF image files from an image file. It basically
- * converts images of other formats to TIFF format, or a multi-page TIFF
- * image to multiple TIFF image files.
- *
- * @param imageFile input image file
- * @param index an index of the page; -1 means all pages, as in a multi-page
- * TIFF image
- * @return a list of TIFF image files
- * @throws IOException
- */
- public static ListIIOImage
- * objects.
- *
- * @param imageList a list of IIOImage
objects
- * @param index an index of the page; -1 means all pages
- * @return a list of TIFF image files
- * @throws IOException
- */
- public static ListIIOImage
object.
- *
- * @param image an IIOImage
object
- * @return a byte buffer of pixel data
- * @throws IOException
- */
- public static ByteBuffer getImageByteBuffer(IIOImage image) throws IOException {
- return getImageByteBuffer(image.getRenderedImage());
- }
-
- /**
- * Gets pixel data of an RenderedImage
object.
- *
- * @param image an RenderedImage
object
- * @return a byte buffer of pixel data
- * @throws IOException
- */
- public static ByteBuffer getImageByteBuffer(RenderedImage image) throws IOException {
- //Set up the writeParam
- TIFFImageWriteParam tiffWriteParam = new TIFFImageWriteParam(Locale.US);
- tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED);
-
- //Get tif writer and set output to file
- IteratorBufferedImage
to ByteBuffer
.
- *
- * @param bi Input image
- * @return pixel data
- */
- public static ByteBuffer convertImageData(BufferedImage bi) {
- DataBuffer buff = bi.getRaster().getDataBuffer();
- // ClassCastException thrown if buff not instanceof DataBufferByte because raster data is not necessarily bytes.
- // Convert the original buffered image to grayscale.
- if (!(buff instanceof DataBufferByte)) {
- bi = ImageHelper.convertImageToGrayscale(bi);
- buff = bi.getRaster().getDataBuffer();
- }
- byte[] pixelData = ((DataBufferByte) buff).getData();
- // return ByteBuffer.wrap(pixelData);
- ByteBuffer buf = ByteBuffer.allocateDirect(pixelData.length);
- buf.order(ByteOrder.nativeOrder());
- buf.put(pixelData);
- buf.flip();
- return buf;
- }
-
- /**
- * Gets a list of BufferedImage
objects for an image file.
- *
- * @param imageFile input image file. It can be any of the supported
- * formats, including TIFF, JPEG, GIF, PNG, BMP, JPEG
- * @return a list of BufferedImage
objects
- * @throws IOException
- */
- public static ListIIOImage
objects for an image file.
- *
- * @param imageFile input image file. It can be any of the supported
- * formats, including TIFF, JPEG, GIF, PNG, BMP, JPEG, and PDF if GPL
- * Ghostscript is installed
- * @return a list of IIOImage
objects
- * @throws IOException
- */
- public static ListIIOImage
objects for a
- * BufferedImage
.
- *
- * @param bi input image
- * @return a list of IIOImage
objects
- * @throws IOException
- */
- public static ListBufferedImage
- * @param outputTiff the output TIFF file
- * @throws IOException
- */
- public static void mergeTiff(BufferedImage[] inputImages, File outputTiff) throws IOException {
- mergeTiff(inputImages, outputTiff, null);
- }
-
- /**
- * Merges multiple images into one multi-page TIFF image.
- *
- * @param inputImages an array of BufferedImage
- * @param outputTiff the output TIFF file
- * @param compressionType valid values: LZW, CCITT T.6, PackBits
- * @throws IOException
- */
- public static void mergeTiff(BufferedImage[] inputImages, File outputTiff, String compressionType) throws IOException {
- ListIIOImage
objects
- * @param outputTiff the output TIFF file
- * @throws IOException
- */
- public static void mergeTiff(ListIIOImage
objects
- * @param outputTiff the output TIFF file
- * @param compressionType valid values: LZW, CCITT T.6, PackBits
- * @throws IOException
- */
- public static void mergeTiff(ListNative.loadLibrary()
.
- */
- public static TessAPI getTessAPIInstance() {
- return (TessAPI) Native.loadLibrary(getTesseractLibName(), TessAPI.class);
- }
-
- /**
- * Gets native library name.
- *
- * @return the name of the tesseract library to be loaded using the
- * Native.register()
.
- */
- public static String getTesseractLibName() {
- return Platform.isWindows() ? LIB_NAME : LIB_NAME_NON_WIN;
- }
-
- /**
- * Extracts tesseract resources to temp folder.
- *
- * @param resourceName name of file or directory
- * @return target path, which could be file or directory
- */
- public static synchronized File extractTessResources(String resourceName) {
- File targetPath = null;
-
- try {
- targetPath = new File(TESS4J_TEMP_DIR, resourceName);
-
- EnumerationSystem.setProperty(PDF_LIBRARY, PDFBOX);
to set PDFBox as
- * default.
- */
-public class PdfUtilities {
-
- public static final String PDF_LIBRARY = "pdf.library";
- public static final String PDFBOX = "pdfbox";
-
- /**
- * Converts PDF to TIFF format.
- *
- * @param inputPdfFile input file
- * @return a multi-page TIFF image
- * @throws IOException
- */
- public static File convertPdf2Tiff(File inputPdfFile) throws IOException {
- if (PDFBOX.equals(System.getProperty(PDF_LIBRARY))) {
- return PdfBoxUtilities.convertPdf2Tiff(inputPdfFile);
- } else {
- try {
- return PdfGsUtilities.convertPdf2Tiff(inputPdfFile);
- } catch (Exception e) {
- System.setProperty(PDF_LIBRARY, PDFBOX);
- return convertPdf2Tiff(inputPdfFile);
- }
- }
- }
-
- /**
- * Converts PDF to PNG format.
- *
- * @param inputPdfFile input file
- * @return an array of PNG images
- * @throws java.io.IOException
- */
- public static File[] convertPdf2Png(File inputPdfFile) throws IOException {
- if (PDFBOX.equals(System.getProperty(PDF_LIBRARY))) {
- return PdfBoxUtilities.convertPdf2Png(inputPdfFile);
- } else {
- try {
- return PdfGsUtilities.convertPdf2Png(inputPdfFile);
- } catch (Exception e) {
- System.setProperty(PDF_LIBRARY, PDFBOX);
- return convertPdf2Png(inputPdfFile);
- }
- }
- }
-
- /**
- * Splits PDF.
- *
- * @deprecated As of Release 3.0.
- *
- * @param inputPdfFile input file
- * @param outputPdfFile output file
- * @param firstPage begin page
- * @param lastPage end page
- */
- public static void splitPdf(String inputPdfFile, String outputPdfFile, String firstPage, String lastPage) {
- if (firstPage.trim().isEmpty()) {
- firstPage = "0";
- }
- if (lastPage.trim().isEmpty()) {
- lastPage = "0";
- }
-
- splitPdf(new File(inputPdfFile), new File(outputPdfFile), Integer.parseInt(firstPage), Integer.parseInt(lastPage));
- }
-
- /**
- * Splits PDF.
- *
- * @param inputPdfFile input file
- * @param outputPdfFile output file
- * @param firstPage begin page
- * @param lastPage end page
- */
- public static void splitPdf(File inputPdfFile, File outputPdfFile, int firstPage, int lastPage) {
- if (PDFBOX.equals(System.getProperty(PDF_LIBRARY))) {
- PdfBoxUtilities.splitPdf(inputPdfFile, outputPdfFile, firstPage, lastPage);
- } else {
- try {
- PdfGsUtilities.splitPdf(inputPdfFile, outputPdfFile, firstPage, lastPage);
- } catch (Exception e) {
- System.setProperty(PDF_LIBRARY, PDFBOX);
- splitPdf(inputPdfFile, outputPdfFile, firstPage, lastPage);
- }
- }
- }
-
- /**
- * Gets PDF Page Count.
- *
- * @deprecated As of Release 3.0.
- *
- * @param inputPdfFile input file
- * @return number of pages
- */
- public static int getPdfPageCount(String inputPdfFile) {
- return getPdfPageCount(new File(inputPdfFile));
- }
-
- /**
- * Gets PDF Page Count.
- *
- * @param inputPdfFile input file
- * @return number of pages
- */
- public static int getPdfPageCount(File inputPdfFile) {
- if (PDFBOX.equals(System.getProperty(PDF_LIBRARY))) {
- return PdfBoxUtilities.getPdfPageCount(inputPdfFile);
- } else {
- try {
- return PdfGsUtilities.getPdfPageCount(inputPdfFile);
- } catch (Exception e) {
- System.setProperty(PDF_LIBRARY, PDFBOX);
- return getPdfPageCount(inputPdfFile);
- }
- }
- }
-
- /**
- * Merges PDF files.
- *
- * @param inputPdfFiles array of input files
- * @param outputPdfFile output file
- */
- public static void mergePdf(File[] inputPdfFiles, File outputPdfFile) {
- if (PDFBOX.equals(System.getProperty(PDF_LIBRARY))) {
- PdfBoxUtilities.mergePdf(inputPdfFiles, outputPdfFile);
- } else {
- try {
- PdfGsUtilities.mergePdf(inputPdfFiles, outputPdfFile);
- } catch (Exception e) {
- System.setProperty(PDF_LIBRARY, PDFBOX);
- mergePdf(inputPdfFiles, outputPdfFile);
- }
- }
- }
-}
diff --git a/Tess4J/src/net/sourceforge/tess4j/util/Utils.java b/Tess4J/src/net/sourceforge/tess4j/util/Utils.java
deleted file mode 100644
index ce42d78..0000000
--- a/Tess4J/src/net/sourceforge/tess4j/util/Utils.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright @ 2013 Quan Nguyen
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package net.sourceforge.tess4j.util;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.lang.reflect.Field;
-import java.lang.reflect.Modifier;
-
-public class Utils {
-
-    /**
-     * Writes a byte array to a file, creating missing parent directories
-     * first.
-     *
-     * @param data byte array
-     * @param outFile output file
-     * @throws IOException if the file cannot be opened, written, or closed
-     */
-    public static void writeFile(byte[] data, File outFile) throws IOException {
-        // create parent dirs when necessary
-        File parentDir = outFile.getParentFile();
-        if (parentDir != null) {
-            parentDir.mkdirs();
-        }
-
-        // try-with-resources closes the stream on every path and keeps a
-        // write failure as the primary exception instead of letting a
-        // failing close() replace it.
-        try (FileOutputStream fos = new FileOutputStream(outFile)) {
-            fos.write(data);
-        }
-    }
-
-    /**
-     * Gets user-friendly name of the public static final constant defined in a
-     * class or an interface for display purpose.
-     *
-     * Only fields declared directly by <code>c</code> are examined. When no
-     * constant equals <code>value</code>, or a field cannot be read, the
-     * string form of <code>value</code> is returned instead.
-     *
-     * @param value the constant value; may be <code>null</code>
-     * @param c type of class or interface
-     * @return name of the first matching constant, or
-     * <code>String.valueOf(value)</code> if there is none
-     */
-    public static String getConstantName(Object value, Class<?> c) {
-        for (Field f : c.getDeclaredFields()) {
-            int mod = f.getModifiers();
-            if (Modifier.isStatic(mod) && Modifier.isPublic(mod) && Modifier.isFinal(mod)) {
-                try {
-                    Object constant = f.get(null);
-                    // A constant may itself be null, so compare null-safely
-                    // rather than calling equals() on it unconditionally.
-                    if (constant == null ? value == null : constant.equals(value)) {
-                        return f.getName();
-                    }
-                } catch (IllegalAccessException e) {
-                    return String.valueOf(value);
-                }
-            }
-        }
-        return String.valueOf(value);
-    }
-}
diff --git a/Tess4J/tessdata/configs/api_config b/Tess4J/tessdata/configs/api_config
deleted file mode 100644
index 5cd6ec0..0000000
--- a/Tess4J/tessdata/configs/api_config
+++ /dev/null
@@ -1 +0,0 @@
-tessedit_zero_rejection T
diff --git a/Tess4J/tessdata/configs/digits b/Tess4J/tessdata/configs/digits
deleted file mode 100644
index 6a329f8..0000000
--- a/Tess4J/tessdata/configs/digits
+++ /dev/null
@@ -1 +0,0 @@
-tessedit_char_whitelist 0123456789-.
diff --git a/Tess4J/tessdata/configs/hocr b/Tess4J/tessdata/configs/hocr
deleted file mode 100644
index 72f83e8..0000000
--- a/Tess4J/tessdata/configs/hocr
+++ /dev/null
@@ -1 +0,0 @@
-tessedit_create_hocr 1
\ No newline at end of file
diff --git a/Tess4J/tessdata/eng.traineddata b/Tess4J/tessdata/eng.traineddata
deleted file mode 100644
index 561883f..0000000
Binary files a/Tess4J/tessdata/eng.traineddata and /dev/null differ
diff --git a/Tess4J/tessdata/osd.traineddata b/Tess4J/tessdata/osd.traineddata
deleted file mode 100644
index 527457c..0000000
Binary files a/Tess4J/tessdata/osd.traineddata and /dev/null differ
diff --git a/Tess4J/tessdata/pdf.ttf b/Tess4J/tessdata/pdf.ttf
deleted file mode 100644
index eb359b3..0000000
Binary files a/Tess4J/tessdata/pdf.ttf and /dev/null differ
diff --git a/Tess4J/tessdata/pdf.ttx b/Tess4J/tessdata/pdf.ttx
deleted file mode 100644
index c6db1c8..0000000
--- a/Tess4J/tessdata/pdf.ttx
+++ /dev/null
@@ -1,793 +0,0 @@
-
-tessdata
folder into a temp folder.
- */
- logger.info("Loading the tessdata folder into a temporary folder.");
- tessDataFolder = LoadLibs.extractTessResources("tessdata");
-
- /**
- * Gets tesseract instance and sets data path.
- */
- ITesseract instance = new Tesseract();
-
- if (tessDataFolder != null) {
- logger.info(tessDataFolder.getAbsolutePath());
- instance.setDatapath(tessDataFolder.getParent());
- }
-
- /**
- * Performs OCR on the image.
- */
- String result = instance.doOCR(imageFile);
- logger.info(result);
-
- } catch (TesseractException e) {
- logger.error(e.getMessage());
- logger.error(e.getMessage(), e);
- } catch (URISyntaxException e) {
- logger.error(e.getMessage(), e);
- }
-
- // checks if tessdata folder exists
- assertTrue(tessDataFolder != null && tessDataFolder.exists());
- }
-
-}
diff --git a/Tess4J/test/net/sourceforge/tess4j/util/PdfUtilitiesTest.java b/Tess4J/test/net/sourceforge/tess4j/util/PdfUtilitiesTest.java
deleted file mode 100644
index ff738ef..0000000
--- a/Tess4J/test/net/sourceforge/tess4j/util/PdfUtilitiesTest.java
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright 2014 Quan Nguyen.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package net.sourceforge.tess4j.util;
-
-import java.io.File;
-import java.io.IOException;
-import org.junit.After;
-import org.junit.AfterClass;
-import org.junit.Before;
-import org.junit.BeforeClass;
-import org.junit.Test;
-import static org.junit.Assert.*;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Unit tests for the PDF helper methods of {@link PdfUtilities}.
- *
- * Each test runs with the PDF library system property set to PDFBox; the
- * previous value of the property is restored after every test so the
- * setting does not leak into other test classes run in the same JVM.
- */
-public class PdfUtilitiesTest {
-
-    private static final Logger logger = LoggerFactory.getLogger(new LoggHelper().toString());
-    private static final String TEST_RESOURCES_DATA_PATH = "test/resources/test-data";
-
-    /** Value of the PDF library property before the current test changed it; null if it was unset. */
-    private String previousPdfLibrary;
-
-    @BeforeClass
-    public static void setUpClass() {
-    }
-
-    @AfterClass
-    public static void tearDownClass() {
-    }
-
-    @Before
-    public void setUp() {
-        previousPdfLibrary = System.getProperty(PdfUtilities.PDF_LIBRARY);
-        System.setProperty(PdfUtilities.PDF_LIBRARY, PdfUtilities.PDFBOX); // Note: comment out to test Ghostscript
-    }
-
-    @After
-    public void tearDown() {
-        // Put the JVM-wide property back exactly as it was found.
-        if (previousPdfLibrary == null) {
-            System.clearProperty(PdfUtilities.PDF_LIBRARY);
-        } else {
-            System.setProperty(PdfUtilities.PDF_LIBRARY, previousPdfLibrary);
-        }
-    }
-
-    /**
-     * Test of convertPdf2Tiff method, of class PdfUtilities.
-     *
-     * @throws java.lang.Exception
-     */
-    @Test
-    public void testConvertPdf2Tiff() throws Exception {
-        logger.info("convertPdf2Tiff");
-        File inputPdfFile = new File(TEST_RESOURCES_DATA_PATH, "eurotext.pdf");
-        File result = PdfUtilities.convertPdf2Tiff(inputPdfFile);
-        result.deleteOnExit();
-        assertTrue(result.exists());
-    }
-
-    /**
-     * Test of convertPdf2Png method, of class PdfUtilities.
-     *
-     * @throws java.io.IOException
-     */
-    @Test
-    public void testConvertPdf2Png() throws IOException {
-        logger.info("convertPdf2Png");
-        File inputPdfFile = new File(TEST_RESOURCES_DATA_PATH, "eurotext.pdf");
-        File[] results = PdfUtilities.convertPdf2Png(inputPdfFile);
-        assertTrue(results.length > 0);
-
-        //clean up
-        File parentDir = results[0].getParentFile();
-        for (File result : results) {
-            result.delete();
-        }
-        parentDir.delete();
-    }
-
-    /**
-     * Test of splitPdf method, of class PdfUtilities.
-     */
-    @Test
-    public void testSplitPdf() {
-        logger.info("splitPdf");
-        File inputPdfFile = new File(TEST_RESOURCES_DATA_PATH, "multipage-pdf.pdf");
-        File outputPdfFile = new File("test/test-results/multipage-pdf_splitted.pdf");
-        int startPage = 2;
-        int endPage = 3;
-        int expResult = 2;
-        PdfUtilities.splitPdf(inputPdfFile, outputPdfFile, startPage, endPage);
-        int pageCount = PdfUtilities.getPdfPageCount(outputPdfFile);
-        assertEquals(expResult, pageCount);
-    }
-
-    /**
-     * Test of getPdfPageCount method, of class PdfUtilities.
-     */
-    @Test
-    public void testGetPdfPageCount() {
-        logger.info("getPdfPageCount");
-        File inputPdfFile = new File(TEST_RESOURCES_DATA_PATH, "multipage-pdf.pdf");
-        int expResult = 5;
-        int result = PdfUtilities.getPdfPageCount(inputPdfFile);
-        assertEquals(expResult, result);
-    }
-
-    /**
-     * Test of mergePdf method, of class PdfUtilities.
-     */
-    @Test
-    public void testMergePdf() {
-        logger.info("mergePdf");
-        File pdfPartOne = new File(TEST_RESOURCES_DATA_PATH, "eurotext.pdf");
-        File pdfPartTwo = new File(TEST_RESOURCES_DATA_PATH, "multipage-pdf.pdf");
-        int expResult = 6;
-        File outputPdfFile = new File("test/test-results", "multipage-pdf_merged.pdf");
-        File[] inputPdfFiles = {pdfPartOne, pdfPartTwo};
-        PdfUtilities.mergePdf(inputPdfFiles, outputPdfFile);
-        assertEquals(expResult, PdfUtilities.getPdfPageCount(outputPdfFile));
-    }
-
-}
diff --git a/Tess4J/test/resources/test-data/eurotext.bmp b/Tess4J/test/resources/test-data/eurotext.bmp
deleted file mode 100644
index be05080..0000000
Binary files a/Tess4J/test/resources/test-data/eurotext.bmp and /dev/null differ
diff --git a/Tess4J/test/resources/test-data/eurotext.pdf b/Tess4J/test/resources/test-data/eurotext.pdf
deleted file mode 100644
index eac4388..0000000
Binary files a/Tess4J/test/resources/test-data/eurotext.pdf and /dev/null differ
diff --git a/Tess4J/test/resources/test-data/eurotext.png b/Tess4J/test/resources/test-data/eurotext.png
deleted file mode 100644
index e5c324e..0000000
Binary files a/Tess4J/test/resources/test-data/eurotext.png and /dev/null differ
diff --git a/Tess4J/test/resources/test-data/eurotext.tif b/Tess4J/test/resources/test-data/eurotext.tif
deleted file mode 100644
index 92791da..0000000
Binary files a/Tess4J/test/resources/test-data/eurotext.tif and /dev/null differ
diff --git a/Tess4J/test/resources/test-data/eurotext_deskew.png b/Tess4J/test/resources/test-data/eurotext_deskew.png
deleted file mode 100644
index dfa06bd..0000000
Binary files a/Tess4J/test/resources/test-data/eurotext_deskew.png and /dev/null differ
diff --git a/Tess4J/test/resources/test-data/eurotext_unlv.png b/Tess4J/test/resources/test-data/eurotext_unlv.png
deleted file mode 100644
index e5c324e..0000000
Binary files a/Tess4J/test/resources/test-data/eurotext_unlv.png and /dev/null differ
diff --git a/Tess4J/test/resources/test-data/eurotext_unlv.uzn b/Tess4J/test/resources/test-data/eurotext_unlv.uzn
deleted file mode 100644
index 878c8de..0000000
--- a/Tess4J/test/resources/test-data/eurotext_unlv.uzn
+++ /dev/null
@@ -1,3 +0,0 @@
-97 162 747 50 ThirdLine
-97 209 828 55 FourthLine
-92 56 810 107 First2Lines
\ No newline at end of file
diff --git a/Tess4J/test/resources/test-data/multipage-pdf.pdf b/Tess4J/test/resources/test-data/multipage-pdf.pdf
deleted file mode 100644
index fa1eade..0000000
Binary files a/Tess4J/test/resources/test-data/multipage-pdf.pdf and /dev/null differ
diff --git a/Tess4J/versionchanges.txt b/Tess4J/versionchanges.txt
deleted file mode 100644
index 503e438..0000000
--- a/Tess4J/versionchanges.txt
+++ /dev/null
@@ -1,137 +0,0 @@
-Tess4J Change Summary
-
-Version 0.1 - initial release (14 Aug 2010):
-- Java JNA-based wrapper for Tesseract OCR DLL 2.04
-- Support uncompressed, binary TIFF images
-
-Version 0.2 (16 Aug 2010):
-- Add support for more image formats (PNG, BMP, GIF, PDF, JPEG)
-- Add support for compressed, grayscale and colored images
-
-Version 0.3 (22 Aug 2010):
-- Include API support for BufferedImage
-- Clean up code. Remove unsupported APIs and files
-- Document the API
-
-Version 0.3.1 (26 Aug 2010):
-- Send only pixel data, not the whole image data, to the Tesseract engine, to fix a bug that erroneously moved some words from the beginning of a line toward the end of the line
-
-Version 0.4 (1 Nov 2010):
-- Add JNA Direct Mapping calls, which can provide performance near that of custom JNI
-
-Version 1.0 (30 October 2012):
-- Upgrade to Tesseract 3.02 (r798), which is not backward compatible with Tesseract 2.04.
-- Implement a new JNA wrapper for the new Tesseract OCR API
-- Add more unit test cases
-- Update documentation
-
-Version 1.1 (3 March 2013)
-- Update Tesseract DLL to r828
-- Additional API methods, image helper methods, and unit test cases
-- Improve handling of Unicode character encoding
-- Fix memory leaks
-- Add support for determining skew angle and image rotation
-
-Version 1.2 (22 September 2013)
-- Update Tesseract DLL to r866
-- More efficient OCR of multiple images
-- Various minor improvements
-- Update JNA to v4.0
-
-Version 1.3 (31 May 2014)
-- Update JNA to v4.1.0
-- Update Ghost4J to v0.5.1
-- Refactoring
-- Bundle Tesseract and Leptonica 64-bit DLLs
-
-Version 1.4 (18 January 2015)
-- Refactor to reduce code duplication
-- Embed Windows native resources in JAR
-- Autoload Windows native libraries
-
-Version 1.4.1 (24 January 2015)
-- Enable use of jna.library.path system property for user-customizable path
-
-Version 1.5 (13 March 2015)
-- Add UNLV zone file support
-- Refactor
-
-Version 2.0 (29 March 2015)
-- Upgrade to Tesseract 3.03 (r1050), which is compatible with Tesseract 3.03RC on Linux
-- Refactor Tesseract class for extensibility and thread-safety
-- Update English language data for Tesseract 3.02
-
-Version 3.0 (25 December 2015)
-- Upgrade to Tesseract 3.04 (953523b)
-- Include Lept4J library
-- Incorporate slf4j and logback libraries for logging
-- Make GhostScript calls thread safe
-
-Version 3.1 (21 March 2016)
-- Update Tesseract to 3.04.01 (4ef68a0)
-- Use Lept4J-1.1.2 (Leptonica 1.72)
-- Update JNA to 4.2.2
-- Update Ghost4J to 1.0.1
-- Delete ResultRenderer after use to release PDF file handler
-
-Version 3.2 (15 May 2016)
-- Revert JNA to 4.1.0 due to "Invalid calling convention 63" errors invoking GhostScript via Ghost4J on Linux
-- Update Lept4J to 1.2.2 (Leptonica 1.73)
-- Recompile Tesseract 3.04.01 DLL against Leptonica 1.73
-- Update GhostScript Windows binary to 9.19
-
-Version 3.2.1 (29 May 2016)
-- Properly release Box and Boxa resources
-- Update Lept4J to 1.2.3
-
-Version 3.2.2 (16 February 2017)
-- Update GhostScript to 9.20
-- Fix possible NPE in PDF-related code
-- Update dependencies
-- Additional image utility methods
-
-Version 3.3.0 (16 February 2017)
-- Upgrade to Tesseract 3.05 (2ca5d0a)
-- Update Lept4J to 1.3.0 (Leptonica 1.74.1)
-
-Version 3.3.1 (23 March 2017)
-- Update Lept4J to 1.3.1
-- Update other dependencies
-
-Version 3.4.0 (1 June 2017)
-- Upgrade to Tesseract 3.05.01 (2158661)
-- Update Lept4J to 1.4.0
-- Add support for jboss-vfs protocol
-
-Version 3.4.1 (22 September 2017)
-- Do not extract/copy a native resource if it already exists and has the same file size
-- Update Tesseract 3.05.01 (e2e79c4); link against Leptonica 1.74.4
-- Update Lept4J to 1.6.1
-
-Version 3.4.2 (14 November 2017)
-- Update Lept4J to 1.6.2
-- Update GhostScript to 9.22
-- Improve handling of PDF files in multi-threaded environment
-- Lift limits on number of pages in PDF
-- Use TESSDATA_PREFIX environment variable by default, if defined
-
-Version 3.4.3 (14 January 2018)
-- Do not extract/copy a resource if it already exists and has the same file size
-
-Version 3.4.4 (22 February 2018)
-- Exclude logback.xml from JAR
-- Add image rotate and deskew methods
-- Update Lept4J to 1.6.3
-
-Version 3.4.5 (21 March 2018)
-- Remove GS DLL due to license incompatibility
-- Use PDFBox
-
-Version 3.4.6 (25 March 2018)
-- Update PDFBox dependencies
-
-Version 3.4.7 (16 April 2018)
-- Update jai-imageio-core to 1.4.0 for Java 9 fixes
-
-Version 3.4.8 (2 May 2018)
-- Fix a path issue when extracting resources from JAR to temp directory on Windows server
\ No newline at end of file