diff --git a/Tess4J/build.xml b/Tess4J/build.xml deleted file mode 100644 index bd1e903..0000000 --- a/Tess4J/build.xml +++ /dev/null @@ -1,116 +0,0 @@ - - - - - - - - - - - Builds, tests, and runs the project Tess4J. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/Tess4J/dist/README.TXT b/Tess4J/dist/README.TXT deleted file mode 100644 index f3e520d..0000000 --- a/Tess4J/dist/README.TXT +++ /dev/null @@ -1,32 +0,0 @@ -======================== -BUILD OUTPUT DESCRIPTION -======================== - -When you build an Java application project that has a main class, the IDE -automatically copies all of the JAR -files on the projects classpath to your projects dist/lib folder. The IDE -also adds each of the JAR files to the Class-Path element in the application -JAR files manifest file (MANIFEST.MF). - -To run the project from the command line, go to the dist folder and -type the following: - -java -jar "tess4j-3.4.8.jar" - -To distribute this project, zip up the dist folder (including the lib folder) -and distribute the ZIP file. - -Notes: - -* If two JAR files on the project classpath have the same name, only the first -JAR file is copied to the lib folder. -* Only JAR files are copied to the lib folder. -If the classpath contains other types of files or folders, these files (folders) -are not copied. -* If a library on the projects classpath also has a Class-Path element -specified in the manifest,the content of the Class-Path element has to be on -the projects runtime path. -* To set a main class in a standard Java project, right-click the project node -in the Projects window and choose Properties. Then click Run and enter the -class name in the Main Class field. Alternatively, you can manually type the -class name in the manifest Main-Class element. diff --git a/Tess4J/dist/tess4j-3.4.8.jar b/Tess4J/dist/tess4j-3.4.8.jar deleted file mode 100644 index e6790e9..0000000 Binary files a/Tess4J/dist/tess4j-3.4.8.jar and /dev/null differ diff --git a/Tess4J/eng.traineddata b/Tess4J/eng.traineddata deleted file mode 100644 index 561883f..0000000 Binary files a/Tess4J/eng.traineddata and /dev/null differ diff --git a/Tess4J/lib/commons-beanutils-1.9.2.jar b/Tess4J/lib/commons-beanutils-1.9.2.jar deleted file mode 100644 index 289461d..0000000 Binary files a/Tess4J/lib/commons-beanutils-1.9.2.jar and /dev/null differ diff --git a/Tess4J/lib/commons-io-2.6.jar b/Tess4J/lib/commons-io-2.6.jar deleted file mode 100644 index 00556b1..0000000 Binary files a/Tess4J/lib/commons-io-2.6.jar and /dev/null differ diff --git a/Tess4J/lib/commons-logging-1.2.jar b/Tess4J/lib/commons-logging-1.2.jar deleted file mode 100644 index 93a3b9f..0000000 Binary files a/Tess4J/lib/commons-logging-1.2.jar and /dev/null differ diff --git a/Tess4J/lib/fontbox-2.0.9.jar b/Tess4J/lib/fontbox-2.0.9.jar deleted file mode 100644 index dd348b1..0000000 Binary files a/Tess4J/lib/fontbox-2.0.9.jar and /dev/null differ diff --git a/Tess4J/lib/ghost4j-1.0.1.jar b/Tess4J/lib/ghost4j-1.0.1.jar deleted file mode 100644 index d8d7c32..0000000 Binary files a/Tess4J/lib/ghost4j-1.0.1.jar and /dev/null differ diff --git a/Tess4J/lib/hamcrest-core-1.3.jar b/Tess4J/lib/hamcrest-core-1.3.jar deleted file mode 100644 index 401097e..0000000 Binary files a/Tess4J/lib/hamcrest-core-1.3.jar and /dev/null differ diff --git a/Tess4J/lib/itext-2.1.7.jar b/Tess4J/lib/itext-2.1.7.jar deleted file mode 100644 index 3f2c188..0000000 Binary files a/Tess4J/lib/itext-2.1.7.jar and /dev/null differ diff --git a/Tess4J/lib/jai-imageio-core-1.4.0.jar b/Tess4J/lib/jai-imageio-core-1.4.0.jar deleted file mode 100644 index eb45949..0000000 Binary files a/Tess4J/lib/jai-imageio-core-1.4.0.jar and /dev/null differ diff --git a/Tess4J/lib/jbig2-imageio-3.0.0.jar b/Tess4J/lib/jbig2-imageio-3.0.0.jar deleted file mode 100644 index 8c5400b..0000000 Binary files a/Tess4J/lib/jbig2-imageio-3.0.0.jar and /dev/null differ diff --git a/Tess4J/lib/jboss-vfs-3.2.12.Final.jar b/Tess4J/lib/jboss-vfs-3.2.12.Final.jar deleted file mode 100644 index 8e8811f..0000000 Binary files a/Tess4J/lib/jboss-vfs-3.2.12.Final.jar and /dev/null differ diff --git a/Tess4J/lib/jcl-over-slf4j-1.7.25.jar b/Tess4J/lib/jcl-over-slf4j-1.7.25.jar deleted file mode 100644 index 8e7fec8..0000000 Binary files a/Tess4J/lib/jcl-over-slf4j-1.7.25.jar and /dev/null differ diff --git a/Tess4J/lib/jna-4.1.0.jar b/Tess4J/lib/jna-4.1.0.jar deleted file mode 100644 index b1a3922..0000000 Binary files a/Tess4J/lib/jna-4.1.0.jar and /dev/null differ diff --git a/Tess4J/lib/jul-to-slf4j-1.7.25.jar b/Tess4J/lib/jul-to-slf4j-1.7.25.jar deleted file mode 100644 index 98d9668..0000000 Binary files a/Tess4J/lib/jul-to-slf4j-1.7.25.jar and /dev/null differ diff --git a/Tess4J/lib/junit-4.12.jar b/Tess4J/lib/junit-4.12.jar deleted file mode 100644 index e703cde..0000000 Binary files a/Tess4J/lib/junit-4.12.jar and /dev/null differ diff --git a/Tess4J/lib/lept4j-1.6.4.jar b/Tess4J/lib/lept4j-1.6.4.jar deleted file mode 100644 index 463bf32..0000000 Binary files a/Tess4J/lib/lept4j-1.6.4.jar and /dev/null differ diff --git a/Tess4J/lib/log4j-1.2.17.jar b/Tess4J/lib/log4j-1.2.17.jar deleted file mode 100644 index 068867e..0000000 Binary files a/Tess4J/lib/log4j-1.2.17.jar and /dev/null differ diff --git a/Tess4J/lib/log4j-over-slf4j-1.7.25.jar b/Tess4J/lib/log4j-over-slf4j-1.7.25.jar deleted file mode 100644 index ba241a4..0000000 Binary files a/Tess4J/lib/log4j-over-slf4j-1.7.25.jar and /dev/null differ diff --git a/Tess4J/lib/logback-classic-1.2.3.jar b/Tess4J/lib/logback-classic-1.2.3.jar deleted file mode 100644 index bed00c0..0000000 Binary files a/Tess4J/lib/logback-classic-1.2.3.jar and /dev/null differ diff --git a/Tess4J/lib/logback-core-1.2.3.jar b/Tess4J/lib/logback-core-1.2.3.jar deleted file mode 100644 index 487b395..0000000 Binary files a/Tess4J/lib/logback-core-1.2.3.jar and /dev/null differ diff --git a/Tess4J/lib/nblibraries.properties b/Tess4J/lib/nblibraries.properties deleted file mode 100644 index dfce7b2..0000000 --- a/Tess4J/lib/nblibraries.properties +++ /dev/null @@ -1,11 +0,0 @@ -# To change this template, choose Tools | Templates -# and open the template in the editor. - -libs.hamcrest.classpath=\ - ${base}/hamcrest-core-1.3.jar -libs.hamcrest.displayName=Hamcrest 1.3 -libs.hamcrest.prop-maven-dependencies=org.hamcrest:hamcrest-core:1.3:jar -libs.junit_4.classpath=\ - ${base}/junit-4.12.jar -libs.junit_4.displayName=JUnit 4.12 -libs.junit_4.prop-maven-dependencies=junit:junit:4.12:jar diff --git a/Tess4J/lib/pdfbox-2.0.9.jar b/Tess4J/lib/pdfbox-2.0.9.jar deleted file mode 100644 index 6912ee8..0000000 Binary files a/Tess4J/lib/pdfbox-2.0.9.jar and /dev/null differ diff --git a/Tess4J/lib/pdfbox-tools-2.0.9.jar b/Tess4J/lib/pdfbox-tools-2.0.9.jar deleted file mode 100644 index 01dcc46..0000000 Binary files a/Tess4J/lib/pdfbox-tools-2.0.9.jar and /dev/null differ diff --git a/Tess4J/lib/slf4j-api-1.7.25.jar b/Tess4J/lib/slf4j-api-1.7.25.jar deleted file mode 100644 index 0143c09..0000000 Binary files a/Tess4J/lib/slf4j-api-1.7.25.jar and /dev/null differ diff --git a/Tess4J/lib/win32-x86-64/libtesseract3051.dll b/Tess4J/lib/win32-x86-64/libtesseract3051.dll deleted file mode 100644 index 6cd60e0..0000000 Binary files a/Tess4J/lib/win32-x86-64/libtesseract3051.dll and /dev/null differ diff --git a/Tess4J/lib/win32-x86/libtesseract3051.dll b/Tess4J/lib/win32-x86/libtesseract3051.dll deleted file mode 100644 index ddc5e08..0000000 Binary files a/Tess4J/lib/win32-x86/libtesseract3051.dll and /dev/null differ diff --git a/Tess4J/lib/xmlgraphics-commons-1.5.jar b/Tess4J/lib/xmlgraphics-commons-1.5.jar deleted file mode 100644 index 0ff3b4f..0000000 Binary files a/Tess4J/lib/xmlgraphics-commons-1.5.jar and /dev/null differ diff --git a/Tess4J/nbproject/build-impl.xml b/Tess4J/nbproject/build-impl.xml deleted file mode 100644 index 7386081..0000000 --- a/Tess4J/nbproject/build-impl.xml +++ /dev/null @@ -1,1793 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Must set src.dir - Must set test.src.dir - Must set build.dir - Must set dist.dir - Must set build.classes.dir - Must set dist.javadoc.dir - Must set build.test.classes.dir - Must set build.test.results.dir - Must set build.classes.excludes - Must set dist.jar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Must set javac.includes - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - No tests executed. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Must set JVM to use for profiling in profiler.info.jvm - Must set profiler agent JVM arguments in profiler.info.jvmargs.agent - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Must select some files in the IDE or set javac.includes - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - To run this application from the command line without Ant, try: - - java -jar "${dist.jar.resolved}" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Must select one file in the IDE or set run.class - - - - Must select one file in the IDE or set run.class - - - - - - - - - - - - - - - - - - - - - - - Must select one file in the IDE or set debug.class - - - - - Must select one file in the IDE or set debug.class - - - - - Must set fix.includes - - - - - - - - - - This target only works when run from inside the NetBeans IDE. - - - - - - - - - Must select one file in the IDE or set profile.class - This target only works when run from inside the NetBeans IDE. - - - - - - - - - This target only works when run from inside the NetBeans IDE. - - - - - - - - - - - - - This target only works when run from inside the NetBeans IDE. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Must select one file in the IDE or set run.class - - - - - - Must select some files in the IDE or set test.includes - - - - - Must select one file in the IDE or set run.class - - - - - Must select one file in the IDE or set applet.url - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Must select some files in the IDE or set javac.includes - - - - - - - - - - - - - - - - - - - - - - - - Some tests failed; see details above. - - - - - - - - - Must select some files in the IDE or set test.includes - - - - Some tests failed; see details above. - - - - Must select some files in the IDE or set test.class - Must select some method in the IDE or set test.method - - - - Some tests failed; see details above. - - - - - Must select one file in the IDE or set test.class - - - - Must select one file in the IDE or set test.class - Must select some method in the IDE or set test.method - - - - - - - - - - - - - - Must select one file in the IDE or set applet.url - - - - - - - - - Must select one file in the IDE or set applet.url - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/Tess4J/nbproject/genfiles.properties b/Tess4J/nbproject/genfiles.properties deleted file mode 100644 index 189448f..0000000 --- a/Tess4J/nbproject/genfiles.properties +++ /dev/null @@ -1,8 +0,0 @@ -build.xml.data.CRC32=1dc6a699 -build.xml.script.CRC32=f0eaf91d -build.xml.stylesheet.CRC32=28e38971@1.38.2.45 -# This file is used by a NetBeans-based IDE to track changes in generated files such as build-impl.xml. -# Do not edit this file. You may delete it but then the IDE will never regenerate such files for you. -nbproject/build-impl.xml.data.CRC32=1f03e186 -nbproject/build-impl.xml.script.CRC32=a52f4060 -nbproject/build-impl.xml.stylesheet.CRC32=3a2fa800@1.88.0.48 diff --git a/Tess4J/nbproject/private/config.properties b/Tess4J/nbproject/private/config.properties deleted file mode 100644 index e69de29..0000000 diff --git a/Tess4J/nbproject/private/private.properties b/Tess4J/nbproject/private/private.properties deleted file mode 100644 index 33e876b..0000000 --- a/Tess4J/nbproject/private/private.properties +++ /dev/null @@ -1,8 +0,0 @@ -compile.on.save=true -do.depend=true -do.jar=true -do.jlink=false -javac.debug=false -javadoc.preview=true -jlink.strip=false -user.properties.file=C:\\Users\\Quan\\AppData\\Roaming\\NetBeans\\dev\\build.properties diff --git a/Tess4J/nbproject/private/private.xml b/Tess4J/nbproject/private/private.xml deleted file mode 100644 index 6807a2b..0000000 --- a/Tess4J/nbproject/private/private.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - diff --git a/Tess4J/nbproject/project.properties b/Tess4J/nbproject/project.properties deleted file mode 100644 index ae00480..0000000 --- a/Tess4J/nbproject/project.properties +++ /dev/null @@ -1,138 +0,0 @@ -annotation.processing.enabled=true -annotation.processing.enabled.in.editor=false -annotation.processing.processors.list= -annotation.processing.run.all.processors=true -annotation.processing.source.output=${build.generated.sources.dir}/ap-source-output -application.desc=A Java wrapper for Tesseract OCR API -application.homepage=http://tess4j.sf.net -application.title=Tess4J -application.vendor=Quan Nguyen -build.classes.dir=${build.dir}/classes -build.classes.excludes=**/*.java,**/*.form -# This directory is removed when the project is cleaned: -build.dir=build -build.generated.dir=${build.dir}/generated -build.generated.sources.dir=${build.dir}/generated-sources -# Only compile against the classpath explicitly listed here: -build.sysclasspath=ignore -build.test.classes.dir=${build.dir}/test/classes -build.test.results.dir=${build.dir}/test/results -# Uncomment to specify the preferred debugger connection transport: -#debug.transport=dt_socket -debug.classpath=\ - ${run.classpath} -debug.modulepath=\ - ${run.modulepath} -debug.test.classpath=\ - ${run.test.classpath} -debug.test.modulepath=\ - ${run.test.modulepath} -# This directory is removed when the project is cleaned: -dist.dir=dist -dist.jar=${dist.dir}/tess4j-${version}.jar -dist.javadoc.dir=${dist.dir}/javadoc -endorsed.classpath= -excludes= -file.reference.commons-beanutils-1.9.2.jar=lib/commons-beanutils-1.9.2.jar -file.reference.commons-io-2.6.jar=lib/commons-io-2.6.jar -file.reference.commons-logging-1.2.jar=lib/commons-logging-1.2.jar -file.reference.fontbox-2.0.9.jar=lib/fontbox-2.0.9.jar -file.reference.ghost4j-1.0.1.jar=lib/ghost4j-1.0.1.jar -file.reference.itext-2.1.7.jar=lib/itext-2.1.7.jar -file.reference.jai-imageio-core-1.4.0.jar=lib/jai-imageio-core-1.4.0.jar -file.reference.jbig2-imageio-3.0.0.jar=lib/jbig2-imageio-3.0.0.jar -file.reference.jboss-vfs-3.2.12.Final.jar=lib/jboss-vfs-3.2.12.Final.jar -file.reference.jna-4.1.0.jar=lib/jna-4.1.0.jar -file.reference.jul-to-slf4j-1.7.25.jar=lib/jul-to-slf4j-1.7.25.jar -file.reference.lept4j-1.6.4.jar=lib/lept4j-1.6.4.jar -file.reference.log4j-1.2.17.jar=lib/log4j-1.2.17.jar -file.reference.logback-classic-1.2.3.jar=lib/logback-classic-1.2.3.jar -file.reference.logback-core-1.2.3.jar=lib/logback-core-1.2.3.jar -file.reference.pdfbox-2.0.9.jar=lib/pdfbox-2.0.9.jar -file.reference.pdfbox-tools-2.0.9.jar=lib/pdfbox-tools-2.0.9.jar -file.reference.slf4j-api-1.7.25.jar=lib/slf4j-api-1.7.25.jar -includes=** -jar.archive.disabled=${jnlp.enabled} -jar.compress=false -jar.index=${jnlp.enabled} -javac.classpath=\ - ${file.reference.ghost4j-1.0.1.jar}:\ - ${file.reference.jna-4.1.0.jar}:\ - ${file.reference.log4j-1.2.17.jar}:\ - ${file.reference.itext-2.1.7.jar}:\ - ${file.reference.commons-beanutils-1.9.2.jar}:\ - ${file.reference.commons-logging-1.2.jar}:\ - ${file.reference.slf4j-api-1.7.25.jar}:\ - ${file.reference.jul-to-slf4j-1.7.25.jar}:\ - ${file.reference.commons-io-2.6.jar}:\ - ${file.reference.jboss-vfs-3.2.12.Final.jar}:\ - ${file.reference.logback-classic-1.2.3.jar}:\ - ${file.reference.logback-core-1.2.3.jar}:\ - ${file.reference.jai-imageio-core-1.4.0.jar}:\ - ${file.reference.lept4j-1.6.4.jar}:\ - ${file.reference.pdfbox-2.0.9.jar}:\ - ${file.reference.pdfbox-tools-2.0.9.jar}:\ - ${file.reference.fontbox-2.0.9.jar}:\ - ${file.reference.jbig2-imageio-3.0.0.jar} -# Space-separated list of extra javac options -javac.compilerargs= -javac.deprecation=true -javac.external.vm=false -javac.modulepath= -javac.processormodulepath= -javac.processorpath=\ - ${javac.classpath} -javac.source=1.7 -javac.target=1.7 -javac.test.classpath=\ - ${javac.classpath}:\ - ${build.classes.dir}:\ - ${libs.junit_4.classpath}:\ - ${libs.hamcrest.classpath} -javac.test.modulepath=\ - ${javac.modulepath} -javac.test.processorpath=\ - ${javac.test.classpath} -javadoc.additionalparam= -javadoc.author=false -javadoc.encoding=${source.encoding} -javadoc.html5=false -javadoc.noindex=false -javadoc.nonavbar=false -javadoc.notree=false -javadoc.private=false -javadoc.splitindex=true -javadoc.use=true -javadoc.version=false -javadoc.windowtitle=Tess4J API -jlink.launcher=false -jlink.launcher.name=Tess4J -jnlp.codebase.type=no.codebase -jnlp.descriptor=application -jnlp.enabled=false -jnlp.mixed.code=default -jnlp.offline-allowed=false -jnlp.signed=false -jnlp.signing= -jnlp.signing.alias= -jnlp.signing.keystore= -meta.inf.dir=${src.dir}/META-INF -mkdist.disabled=false -platform.active=default_platform -run.classpath=\ - ${javac.classpath}:\ - ${build.classes.dir} -# Space-separated list of JVM arguments used when running the project -# (you may also define separate properties like run-sys-prop.name=value instead of -Dname=value -# or test-sys-prop.name=value to set system properties for unit tests): -run.jvmargs= -run.modulepath=\ - ${javac.modulepath} -run.test.classpath=\ - ${javac.test.classpath}:\ - ${build.test.classes.dir} -run.test.modulepath=\ - ${javac.test.modulepath} -source.encoding=UTF-8 -src.dir=src -test.src.dir=test diff --git a/Tess4J/nbproject/project.xml b/Tess4J/nbproject/project.xml deleted file mode 100644 index e1c23bd..0000000 --- a/Tess4J/nbproject/project.xml +++ /dev/null @@ -1,19 +0,0 @@ - - - org.netbeans.modules.java.j2seproject - - - Tess4J - - - - - - - - - .\lib\nblibraries.properties - - - - diff --git a/Tess4J/readme.html b/Tess4J/readme.html deleted file mode 100644 index 7d7a340..0000000 --- a/Tess4J/readme.html +++ /dev/null @@ -1,124 +0,0 @@ - - - - Tess4J - Java Wrapper for Tesseract OCR API - - -
-

- Tess4J -

-

- DESCRIPTION -

-

- Tess4J is a JNA wrapper for Tesseract OCR - API; it provides character recognition support for common image formats, - multi-page images, and PDF documents. The library has been developed and tested - on Windows and Linux. -

-

- Tess4J is released and distributed under the - Apache License, v2.0. Its official homepage is at - http://tess4j.sourceforge.net. -

-

- SOFTWARE REQUIREMENTS -

-

- Java Runtime Environment, - JNA, and JAI-ImageIO - are required. Apache Ant and - JUnit are used for program building and unit testing. The Tesseract DLLs - were built with VS2015 and therefore depend on the - Visual C++ 2015 Redistributable Packages. -

-

- INSTRUCTIONS -

-

- Tesseract 3.05.01 and Leptonica 1.74.4 (via Lept4J) 32- and 64-bit - DLLs, language data for English, and sample images are bundled with the library. - Language data packs for - Tesseract should be decompressed and placed into the tessdata folder. -

-

- The Linux shared object library (libtesseract.so) equivalent to the - DLL is available in Tesseract 3.05.01, which can be built from the source with the instructions given in Tesseract Wiki. -

-

- To unit test, at the command line, execute: -

-
-

- ant test -

-
-

- Support for PDF documents is available through either - GPL Ghostscript, which should be installed and included - in system path, or PDFBox, if Ghostscript is not available. -

-

- Images to be OCRed should be scanned at resolution from at least 200 DPI (dot per - inch) to 400 DPI in monochrome (black&white) or grayscale. Scanning at higher - resolutions will not necessarily result in better recognition accuracy. The actual - success rates depend greatly on the quality of the scanned image. The typical settings - for scanning are 300 DPI and 1 bpp (bit per pixel) black&white or 8 bpp grayscale - uncompressed TIFF or PNG format. PNG is usually smaller in size than other image - formats and still keeps high quality due to its employing lossless data compression - algorithms; TIFF has the advantage of the ability to contain multiple images (pages) - in a file. -

-

- Several built-in functions are also provided for merging several images or PDF files - into a single one for convenient OCR operations, or for splitting a PDF file into - smaller ones if it is too large, which can cause out-of-memory exceptions. -

-

- CODE EXAMPLES -

-

- The following code example shows common usage of the library. Make sure tessdata - folder is populated with appropriate language data files and the .jar - files are in the classpath. On Windows, the DLLs will be automatically extracted - from tess4j.jar to the default temporary directory and loaded. -

-
-
-package net.sourceforge.tess4j.example;
-
-import java.io.File;
-import net.sourceforge.tess4j.*;
-
-public class TesseractExample {
-    public static void main(String[] args) {
-        // ImageIO.scanForPlugins(); // for server environment
-        File imageFile = new File("eurotext.tif");
-        ITesseract instance = new Tesseract(); // JNA Interface Mapping
-        // ITesseract instance = new Tesseract1(); // JNA Direct Mapping
-        // instance.setDatapath("<parentPath>"); // replace <parentPath> with path to parent directory of tessdata
-        // instance.setLanguage("eng");
-
-        try {
-            String result = instance.doOCR(imageFile);
-            System.out.println(result);
-        } catch (TesseractException e) {
-            System.err.println(e.getMessage());
-        }
-    }
-}
-
-
-

- DOCUMENTATIONS -

-

- Please visit the website for the library's documentations -

-
-
- - diff --git a/Tess4J/src/com/recognition/software/jdeskew/ImageDeskew.java b/Tess4J/src/com/recognition/software/jdeskew/ImageDeskew.java deleted file mode 100644 index f1b9b24..0000000 --- a/Tess4J/src/com/recognition/software/jdeskew/ImageDeskew.java +++ /dev/null @@ -1,175 +0,0 @@ -/** - * JDeskew - */ -package com.recognition.software.jdeskew; - -import java.awt.image.BufferedImage; - -public class ImageDeskew { - - /** - * Representation of a line in the image. - */ - public class HoughLine { - - // count of points in the line - public int count = 0; - // index in matrix. - public int index = 0; - // the line is represented as all x, y that solve y * cos(alpha) - x * - // sin(alpha) = d - public double alpha; - public double d; - } - - // the source image - private BufferedImage cImage; - // the range of angles to search for lines - private double cAlphaStart = -20; - private double cAlphaStep = 0.2; - private int cSteps = 40 * 5; - // pre-calculation of sin and cos - private double[] cSinA; - private double[] cCosA; - // range of d - private double cDMin; - private double cDStep = 1.0; - private int cDCount; - // count of points that fit in a line - private int[] cHMatrix; - - /** - * Constructor. - * - * @param image - */ - public ImageDeskew(BufferedImage image) { - this.cImage = image; - } - - /** - * Calculates the skew angle of the image cImage. - * - * @return - */ - public double getSkewAngle() { - ImageDeskew.HoughLine[] hl; - double sum = 0.0; - int count = 0; - - // perform Hough Transformation - calc(); - // top 20 of the detected lines in the image - hl = getTop(20); - - if (hl.length >= 20) { - // average angle of the lines - for (int i = 0; i < 19; i++) { - sum += hl[i].alpha; - count++; - } - return (sum / count); - } else { - return 0.0d; - } - } - - // calculate the count lines in the image with most points - private ImageDeskew.HoughLine[] getTop(int count) { - - ImageDeskew.HoughLine[] hl = new ImageDeskew.HoughLine[count]; - for (int i = 0; i < count; i++) { - hl[i] = new ImageDeskew.HoughLine(); - } - - ImageDeskew.HoughLine tmp; - - for (int i = 0; i < (this.cHMatrix.length - 1); i++) { - if (this.cHMatrix[i] > hl[count - 1].count) { - hl[count - 1].count = this.cHMatrix[i]; - hl[count - 1].index = i; - int j = count - 1; - while ((j > 0) && (hl[j].count > hl[j - 1].count)) { - tmp = hl[j]; - hl[j] = hl[j - 1]; - hl[j - 1] = tmp; - j--; - } - } - } - - int alphaIndex; - int dIndex; - - for (int i = 0; i < count; i++) { - dIndex = hl[i].index / cSteps; // integer division, no - // remainder - alphaIndex = hl[i].index - dIndex * cSteps; - hl[i].alpha = getAlpha(alphaIndex); - hl[i].d = dIndex + cDMin; - } - - return hl; - } - - // Hough Transformation - private void calc() { - int hMin = (int) ((this.cImage.getHeight()) / 4.0); - int hMax = (int) ((this.cImage.getHeight()) * 3.0 / 4.0); - init(); - - for (int y = hMin; y < hMax; y++) { - for (int x = 1; x < (this.cImage.getWidth() - 2); x++) { - // only lower edges are considered - if (ImageUtil.isBlack(this.cImage, x, y)) { - if (!ImageUtil.isBlack(this.cImage, x, y + 1)) { - calc(x, y); - } - } - } - } - - } - - // calculate all lines through the point (x,y) - private void calc(int x, int y) { - double d; - int dIndex; - int index; - - for (int alpha = 0; alpha < (this.cSteps - 1); alpha++) { - d = y * this.cCosA[alpha] - x * this.cSinA[alpha]; - dIndex = (int) (d - this.cDMin); - index = dIndex * this.cSteps + alpha; - try { - this.cHMatrix[index] += 1; - } catch (Exception ex) { - System.out.println(ex.toString()); - } - } - } - - private void init() { - - double angle; - - // pre-calculation of sin and cos - this.cSinA = new double[this.cSteps - 1]; - this.cCosA = new double[this.cSteps - 1]; - - for (int i = 0; i < (this.cSteps - 1); i++) { - angle = getAlpha(i) * Math.PI / 180.0; - this.cSinA[i] = Math.sin(angle); - this.cCosA[i] = Math.cos(angle); - } - - // range of d - this.cDMin = -this.cImage.getWidth(); - this.cDCount = (int) (2.0 * ((this.cImage.getWidth() + this.cImage.getHeight())) / this.cDStep); - this.cHMatrix = new int[this.cDCount * this.cSteps]; - } - - public double getAlpha(int index) { - return this.cAlphaStart + (index * this.cAlphaStep); - } -} diff --git a/Tess4J/src/com/recognition/software/jdeskew/ImageUtil.java b/Tess4J/src/com/recognition/software/jdeskew/ImageUtil.java deleted file mode 100644 index 3959812..0000000 --- a/Tess4J/src/com/recognition/software/jdeskew/ImageUtil.java +++ /dev/null @@ -1,132 +0,0 @@ -/** - * JDeskew - */ -package com.recognition.software.jdeskew; - -import java.awt.Color; -import java.awt.Graphics2D; -import java.awt.RenderingHints; -import java.awt.geom.AffineTransform; -import java.awt.image.BufferedImage; -import java.awt.image.WritableRaster; - -public class ImageUtil { - - /** - * Whether the pixel is black. - * - * @param image source image - * @param x - * @param y - * @return - */ - public static boolean isBlack(BufferedImage image, int x, int y) { - if (image.getType() == BufferedImage.TYPE_BYTE_BINARY) { - WritableRaster raster = image.getRaster(); - int pixelRGBValue = raster.getSample(x, y, 0); - return pixelRGBValue == 0; - } - - int luminanceValue = 140; - return isBlack(image, x, y, luminanceValue); - } - - /** - * Whether the pixel is black. - * - * @param image source image - * @param x - * @param y - * @param luminanceCutOff - * @return - */ - public static boolean isBlack(BufferedImage image, int x, int y, int luminanceCutOff) { - int pixelRGBValue; - int r; - int g; - int b; - double luminance = 0.0; - - // return white on areas outside of image boundaries - if (x < 0 || y < 0 || x > image.getWidth() || y > image.getHeight()) { - return false; - } - - try { - pixelRGBValue = image.getRGB(x, y); - r = (pixelRGBValue >> 16) & 0xff; - g = (pixelRGBValue >> 8) & 0xff; - b = (pixelRGBValue) & 0xff; - luminance = (r * 0.299) + (g * 0.587) + (b * 0.114); - } catch (Exception e) { - // ignore. - } - - return luminance < luminanceCutOff; - } - - /** - * Rotates image. - * - * @param image source image - * @param angle by degrees - * @param cx x-coordinate of pivot point - * @param cy y-coordinate of pivot point - * @return rotated image - */ - public static BufferedImage rotate(BufferedImage image, double angle, int cx, int cy) { - int width = image.getWidth(null); - int height = image.getHeight(null); - - int minX, minY, maxX, maxY; - minX = minY = maxX = maxY = 0; - - int[] corners = {0, 0, width, 0, width, height, 0, height}; - - double theta = Math.toRadians(angle); - for (int i = 0; i < corners.length; i += 2) { - int x = (int) (Math.cos(theta) * (corners[i] - cx) - - Math.sin(theta) * (corners[i + 1] - cy) + cx); - int y = (int) (Math.sin(theta) * (corners[i] - cx) - + Math.cos(theta) * (corners[i + 1] - cy) + cy); - - if (x > maxX) { - maxX = x; - } - - if (x < minX) { - minX = x; - } - - if (y > maxY) { - maxY = y; - } - - if (y < minY) { - minY = y; - } - - } - - cx = (cx - minX); - cy = (cy - minY); - - BufferedImage bi = new BufferedImage((maxX - minX), (maxY - minY), - image.getType()); - Graphics2D g2 = bi.createGraphics(); - g2.setRenderingHint(RenderingHints.KEY_INTERPOLATION, - RenderingHints.VALUE_INTERPOLATION_BICUBIC); - - g2.setBackground(Color.white); - g2.fillRect(0, 0, bi.getWidth(), bi.getHeight()); - - AffineTransform at = new AffineTransform(); - at.rotate(theta, cx, cy); - - g2.setTransform(at); - g2.drawImage(image, -minX, -minY, null); - g2.dispose(); - - return bi; - } -} diff --git a/Tess4J/src/net/sourceforge/tess4j/ITessAPI.java b/Tess4J/src/net/sourceforge/tess4j/ITessAPI.java deleted file mode 100644 index be513e4..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/ITessAPI.java +++ /dev/null @@ -1,617 +0,0 @@ -/** - * Copyright @ 2014 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j; - -import com.sun.jna.Callback; -import com.sun.jna.NativeLong; -import com.sun.jna.Pointer; -import com.sun.jna.PointerType; -import com.sun.jna.Structure; -import java.util.Arrays; -import java.util.List; - -/** - * An interface represents common TessAPI classes/constants. - */ -public interface ITessAPI { - - /** - * When Tesseract/Cube is initialized we can choose to instantiate/load/run - * only the Tesseract part, only the Cube part or both along with the - * combiner. The preference of which engine to use is stored in - * tessedit_ocr_engine_mode.
- *
- * ATTENTION: When modifying this enum, please make sure to make the - * appropriate changes to all the enums mirroring it (e.g. OCREngine in - * cityblock/workflow/detection/detection_storage.proto). Such enums will - * mention the connection to OcrEngineMode in the comments. - */ - public static interface TessOcrEngineMode { - - /** - * Run Tesseract only - fastest - */ - public static final int OEM_TESSERACT_ONLY = 0; - /** - * Run Cube only - better accuracy, but slower - */ - public static final int OEM_CUBE_ONLY = 1; - /** - * Run both and combine results - best accuracy - */ - public static final int OEM_TESSERACT_CUBE_COMBINED = 2; - /** - * Specify this mode when calling init_*(), to indicate - * that any of the above modes should be automatically inferred from the - * variables in the language-specific config, command-line configs, or - * if not specified in any of the above should be set to the default - * OEM_TESSERACT_ONLY. - */ - public static final int OEM_DEFAULT = 3; - }; - - /** - * Possible modes for page layout analysis. These *must* be kept in order of - * decreasing amount of layout analysis to be done, except for - * OSD_ONLY, so that the inequality test macros below work. - */ - public static interface TessPageSegMode { - - /** - * Orientation and script detection only. - */ - public static final int PSM_OSD_ONLY = 0; - /** - * Automatic page segmentation with orientation and script detection. - * (OSD) - */ - public static final int PSM_AUTO_OSD = 1; - /** - * Automatic page segmentation, but no OSD, or OCR. - */ - public static final int PSM_AUTO_ONLY = 2; - /** - * Fully automatic page segmentation, but no OSD. - */ - public static final int PSM_AUTO = 3; - /** - * Assume a single column of text of variable sizes. - */ - public static final int PSM_SINGLE_COLUMN = 4; - /** - * Assume a single uniform block of vertically aligned text. - */ - public static final int PSM_SINGLE_BLOCK_VERT_TEXT = 5; - /** - * Assume a single uniform block of text. - */ - public static final int PSM_SINGLE_BLOCK = 6; - /** - * Treat the image as a single text line. - */ - public static final int PSM_SINGLE_LINE = 7; - /** - * Treat the image as a single word. - */ - public static final int PSM_SINGLE_WORD = 8; - /** - * Treat the image as a single word in a circle. - */ - public static final int PSM_CIRCLE_WORD = 9; - /** - * Treat the image as a single character. - */ - public static final int PSM_SINGLE_CHAR = 10; - /** - * Find as much text as possible in no particular order. - */ - public static final int PSM_SPARSE_TEXT = 11; - /** - * Sparse text with orientation and script detection. - */ - public static final int PSM_SPARSE_TEXT_OSD = 12; - /** - * Number of enum entries. - */ - public static final int PSM_COUNT = 13; - }; - - /** - * Enum of the elements of the page hierarchy, used in - * ResultIterator to provide functions that operate on each - * level without having to have 5x as many functions. - */ - public static interface TessPageIteratorLevel { - - /** - * Block of text/image/separator line. - */ - public static final int RIL_BLOCK = 0; - /** - * Paragraph within a block. - */ - public static final int RIL_PARA = 1; - /** - * Line within a paragraph. - */ - public static final int RIL_TEXTLINE = 2; - /** - * Word within a textline. - */ - public static final int RIL_WORD = 3; - /** - * Symbol/character within a word. - */ - public static final int RIL_SYMBOL = 4; - }; - - /** - * Possible types for a POLY_BLOCK or ColPartition. Must be kept in sync - * with kPBColors in polyblk.cpp and PTIs*Type - * functions below, as well as kPolyBlockNames in - * publictypes.cpp. Used extensively by ColPartition, and POLY_BLOCK. - */ - public static interface TessPolyBlockType { - - /** - * Type is not yet known. Keep as the first element. - */ - public static final int PT_UNKNOWN = 0; - /** - * Text that lives inside a column. - */ - public static final int PT_FLOWING_TEXT = 1; - /** - * Text that spans more than one column. - */ - public static final int PT_HEADING_TEXT = 2; - /** - * Text that is in a cross-column pull-out region. - */ - public static final int PT_PULLOUT_TEXT = 3; - /** - * Partition belonging to an equation region. - */ - public static final int PT_EQUATION = 4; - /** - * Partition has inline equation. - */ - public static final int PT_INLINE_EQUATION = 5; - /** - * Partition belonging to a table region. - */ - public static final int PT_TABLE = 6; - /** - * Text-line runs vertically. - */ - public static final int PT_VERTICAL_TEXT = 7; - /** - * Text that belongs to an image. - */ - public static final int PT_CAPTION_TEXT = 8; - /** - * Image that lives inside a column. - */ - public static final int PT_FLOWING_IMAGE = 9; - /** - * Image that spans more than one column. - */ - public static final int PT_HEADING_IMAGE = 10; - /** - * Image that is in a cross-column pull-out region. - */ - public static final int PT_PULLOUT_IMAGE = 11; - /** - * Horizontal Line. - */ - public static final int PT_HORZ_LINE = 12; - /** - * Vertical Line. - */ - public static final int PT_VERT_LINE = 13; - /** - * Lies outside of any column. - */ - public static final int PT_NOISE = 14; - /** - * Number of enum entries. - */ - public static final int PT_COUNT = 15; - }; - - /** - * NOTA BENE: Fully justified paragraphs (text aligned to both left and - * right margins) are marked by Tesseract with JUSTIFICATION_LEFT if their - * text is written with a left-to-right script and with JUSTIFICATION_RIGHT - * if their text is written in a right-to-left script.
- *
- * Interpretation for text read in vertical lines: "Left" is wherever the - * starting reading position is. - */ - public static interface TessParagraphJustification { - - /** - * The alignment is not clearly one of the other options. This could - * happen for example if there are only one or two lines of text or the - * text looks like source code or poetry. - */ - public static final int JUSTIFICATION_UNKNOWN = 0; - /** - * Each line, except possibly the first, is flush to the same left tab - * stop. - */ - public static final int JUSTIFICATION_LEFT = 1; - /** - * The text lines of the paragraph are centered about a line going down - * through their middle of the text lines. - */ - public static final int JUSTIFICATION_CENTER = 2; - /** - * Each line, except possibly the first, is flush to the same right tab - * stop. - */ - public static final int JUSTIFICATION_RIGHT = 3; - } - - /** - *
-     *  +------------------+
-     *  | 1 Aaaa Aaaa Aaaa |
-     *  | Aaa aa aaa aa    |
-     *  | aaaaaa A aa aaa. |
-     *  |                2 |
-     *  |   #######  c c C |
-     *  |   #######  c c c |
-     *  | < #######  c c c |
-     *  | < #######  c   c |
-     *  | < #######  .   c |
-     *  | 3 #######      c |
-     *  +------------------+
-     * 
Orientation Example: - *
- * ==================== - *
- * Above is a diagram of some (1) English and (2) Chinese text and a (3) - * photo credit.
- *
- * Upright Latin characters are represented as A and a. '<' represents a - * latin character rotated anti-clockwise 90 degrees. Upright Chinese - * characters are represented C and c.
- *
NOTA BENE: enum values here should match goodoc.proto
- *
If you orient your head so that "up" aligns with Orientation, then - * the characters will appear "right side up" and readable.
- *
- * In the example above, both the English and Chinese paragraphs are - * oriented so their "up" is the top of the page (page up). The photo credit - * is read with one's head turned leftward ("up" is to page left).
- *
- * The values of this enum match the convention of Tesseract's osdetect.h - */ - public static interface TessOrientation { - - public static final int ORIENTATION_PAGE_UP = 0; - public static final int ORIENTATION_PAGE_RIGHT = 1; - public static final int ORIENTATION_PAGE_DOWN = 2; - public static final int ORIENTATION_PAGE_LEFT = 3; - }; - - /** - * The grapheme clusters within a line of text are laid out logically in - * this direction, judged when looking at the text line rotated so that its - * Orientation is "page up".
- *
- * For English text, the writing direction is left-to-right. For the Chinese - * text in the above example, the writing direction is top-to-bottom. - */ - public static interface TessWritingDirection { - - public static final int WRITING_DIRECTION_LEFT_TO_RIGHT = 0; - public static final int WRITING_DIRECTION_RIGHT_TO_LEFT = 1; - public static final int WRITING_DIRECTION_TOP_TO_BOTTOM = 2; - }; - - /** - * The text lines are read in the given sequence.
- *
- * In English, the order is top-to-bottom. In Chinese, vertical text lines - * are read right-to-left. Mongolian is written in vertical columns top to - * bottom like Chinese, but the lines order left-to right.
- *
- * Note that only some combinations make sense. For example, - * WRITING_DIRECTION_LEFT_TO_RIGHT implies - * TEXTLINE_ORDER_TOP_TO_BOTTOM. - */ - public static interface TessTextlineOrder { - - public static final int TEXTLINE_ORDER_LEFT_TO_RIGHT = 0; - public static final int TEXTLINE_ORDER_RIGHT_TO_LEFT = 1; - public static final int TEXTLINE_ORDER_TOP_TO_BOTTOM = 2; - }; - - public static final int TRUE = 1; - public static final int FALSE = 0; - - /** - * Base class for all tesseract APIs. Specific classes can add ability to - * work on different inputs or produce different outputs. This class is - * mostly an interface layer on top of the Tesseract instance class to hide - * the data types so that users of this class don't have to include any - * other Tesseract headers. - */ - public static class TessBaseAPI extends PointerType { - - public TessBaseAPI(Pointer address) { - super(address); - } - - public TessBaseAPI() { - super(); - } - }; - - /** - * Class to iterate over tesseract page structure, providing access to all - * levels of the page hierarchy, without including any tesseract headers or - * having to handle any tesseract structures.
- * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, - * SetImage, Recognize, Clear, - * End DetectOS, or anything else that changes the - * internal PAGE_RES. See apitypes.h for the - * definition of PageIteratorLevel. See also - * ResultIterator, derived from PageIterator, - * which adds in the ability to access OCR output with text-specific - * methods. - */ - public static class TessPageIterator extends PointerType { - - public TessPageIterator(Pointer address) { - super(address); - } - - public TessPageIterator() { - super(); - } - }; - - /** - * MutableIterator adds access to internal data structures. - */ - public static class TessMutableIterator extends PointerType { - - public TessMutableIterator(Pointer address) { - super(address); - } - - public TessMutableIterator() { - super(); - } - }; - - /** - * Iterator for tesseract results that is capable of iterating in proper - * reading order over Bi Directional (e.g. mixed Hebrew and English) text. - * ResultIterator adds text-specific methods for access to OCR output. - */ - public static class TessResultIterator extends PointerType { - - public TessResultIterator(Pointer address) { - super(address); - } - - public TessResultIterator() { - super(); - } - }; - - public static class TessChoiceIterator extends PointerType { - - public TessChoiceIterator(Pointer address) { - super(address); - } - - public TessChoiceIterator() { - super(); - } - }; - - /** - * Interface for rendering tesseract results into a document, such as text, - * HOCR or pdf. This class is abstract. Specific classes handle individual - * formats. This interface is then used to inject the renderer class into - * tesseract when processing images. - * - * For simplicity implementing this with tesseract version 3.01, the - * renderer contains document state that is cleared from document to - * document just as the TessBaseAPI is. This way the base API can just - * delegate its rendering functionality to injected renderers, and the - * renderers can manage the associated state needed for the specific formats - * in addition to the heuristics for producing it. - */ - public static class TessResultRenderer extends PointerType { - - public TessResultRenderer(Pointer address) { - super(address); - } - - public TessResultRenderer() { - super(); - } - }; - - /** - * Description of the output of the OCR engine. This structure is used as - * both a progress monitor and the final output header, since it needs to be - * a valid progress monitor while the OCR engine is storing its output to - * shared memory. During progress, all the buffer info is -1. Progress - * starts at 0 and increases to 100 during OCR. No other constraint. Every - * progress callback, the OCR engine must set ocr_alive to 1. - * The HP side will set ocr_alive to 0. Repeated failure to - * reset to 1 indicates that the OCR engine is dead. If the cancel function - * is not null then it is called with the number of user words found. If it - * returns true then operation is cancelled. - */ - public static class ETEXT_DESC extends Structure { - - /** - * chars in this buffer(0). Total number of UTF-8 bytes for this run. - */ - public short count; - /** - * percent complete increasing (0-100) - */ - public short progress; - /** - * true if not last - */ - public byte more_to_come; - /** - * ocr sets to 1, HP 0 - */ - public byte ocr_alive; - /** - * for errcode use - */ - public byte err_code; - /** - * returns true to cancel - */ - public CANCEL_FUNC cancel; - /** - * this or other data for cancel - */ - public Pointer cancel_this; - /** - * time to stop if not 0 - */ - public TimeVal end_time; - /** - * character data - */ - public EANYCODE_CHAR[] text = new EANYCODE_CHAR[1]; - - /** - * Gets Field Order. - * - * @return - */ - @Override - protected List getFieldOrder() { - return Arrays.asList("count", "progress", "more_to_come", "ocr_alive", "err_code", "cancel", "cancel_this", "end_time", "text"); - } - } - - /** - * It should be noted that the format for char_code for version 2.0 and - * beyond is UTF-8, which means that ASCII characters will come out as one - * structure but other characters will be returned in two or more instances - * of this structure with a single byte of the UTF-8 code in each, but each - * will have the same bounding box.
- *
- * Programs which want to handle languages with different characters sets - * will need to handle extended characters appropriately, but - * all - * code needs to be prepared to receive UTF-8 coded characters for - * characters such as bullet and fancy quotes. - */ - public static class EANYCODE_CHAR extends Structure { - - /** - * character itself, one single UTF-8 byte long. A Unicode character may - * consist of one or more UTF-8 bytes. Bytes of a character will have - * the same bounding box. - */ - public byte char_code; - /** - * left of char (-1) - */ - public short left; - /** - * right of char (-1) - */ - public short right; - /** - * top of char (-1) - */ - public short top; - /** - * bottom of char (-1) - */ - public short bottom; - /** - * what font (0) - */ - public short font_index; - /** - * classification confidence: 0=perfect, 100=reject (0/100) - */ - public byte confidence; - /** - * point size of char, 72 = 1 inch, (10) - */ - public byte point_size; - /** - * number of spaces before this char (1) - */ - public byte blanks; - /** - * char formatting (0) - */ - public byte formatting; - - /** - * Gets Field Order. - * - * @return - */ - @Override - protected List getFieldOrder() { - return Arrays.asList("char_code", "left", "right", "top", "bottom", "font_index", "confidence", "point_size", "blanks", "formatting"); - } - } - - /** - * Callback for cancel_func. - */ - interface CANCEL_FUNC extends Callback { - - /** - * - * @param cancel_this - * @param words - * @return - */ - boolean invoke(Pointer cancel_this, int words); - }; - - public static class TimeVal extends Structure { - - /** - * seconds - */ - public NativeLong tv_sec; - /** - * microseconds - */ - public NativeLong tv_usec; - - @Override - protected List getFieldOrder() { - return Arrays.asList("tv_sec", "tv_usec"); - } - } -} diff --git a/Tess4J/src/net/sourceforge/tess4j/ITesseract.java b/Tess4J/src/net/sourceforge/tess4j/ITesseract.java deleted file mode 100644 index b2032f4..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/ITesseract.java +++ /dev/null @@ -1,236 +0,0 @@ -/** - * Copyright @ 2014 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j; - -import java.awt.Rectangle; -import java.awt.image.BufferedImage; -import java.io.File; -import java.nio.ByteBuffer; -import java.util.List; -import javax.imageio.IIOImage; - -/** - * An interface represents common OCR methods. - */ -public interface ITesseract { - - String htmlBeginTag = "\n" - + "\n\n\n" - + "\n\n" - + "\n\n"; - String htmlEndTag = "\n\n"; - - /** - * Rendered formats supported by Tesseract. - */ - public enum RenderedFormat { - - TEXT, HOCR, PDF, UNLV, BOX - } - - /** - * Performs OCR operation. - * - * @param imageFile an image file - * @return the recognized text - * @throws TesseractException - */ - String doOCR(File imageFile) throws TesseractException; - - /** - * Performs OCR operation. - * - * @param imageFile an image file - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @return the recognized text - * @throws TesseractException - */ - String doOCR(File imageFile, Rectangle rect) throws TesseractException; - - /** - * Performs OCR operation. - * - * @param bi a buffered image - * @return the recognized text - * @throws TesseractException - */ - String doOCR(BufferedImage bi) throws TesseractException; - - /** - * Performs OCR operation. - * - * @param bi a buffered image - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @return the recognized text - * @throws TesseractException - */ - String doOCR(BufferedImage bi, Rectangle rect) throws TesseractException; - - /** - * Performs OCR operation. - * - * @param imageList a list of IIOImage objects - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @return the recognized text - * @throws TesseractException - */ - String doOCR(List imageList, Rectangle rect) throws TesseractException; - - /** - * Performs OCR operation. - * - * @param imageList a list of IIOImage objects - * @param filename input file name. Needed only for training and reading a - * UNLV zone file. - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @return the recognized text - * @throws TesseractException - */ - String doOCR(List imageList, String filename, Rectangle rect) throws TesseractException; - - /** - * Performs OCR operation. Use SetImage, (optionally) - * SetRectangle, and one or more of the Get*Text - * functions. - * - * @param xsize width of image - * @param ysize height of image - * @param buf pixel data - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @param bpp bits per pixel, represents the bit depth of the image, with 1 - * for binary bitmap, 8 for gray, and 24 for color RGB. - * @return the recognized text - * @throws TesseractException - */ - String doOCR(int xsize, int ysize, ByteBuffer buf, Rectangle rect, int bpp) throws TesseractException; - - /** - * Performs OCR operation. Use SetImage, (optionally) - * SetRectangle, and one or more of the Get*Text - * functions. - * - * @param xsize width of image - * @param ysize height of image - * @param buf pixel data - * @param filename input file name. Needed only for training and reading a - * UNLV zone file. - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @param bpp bits per pixel, represents the bit depth of the image, with 1 - * for binary bitmap, 8 for gray, and 24 for color RGB. - * @return the recognized text - * @throws TesseractException - */ - String doOCR(int xsize, int ysize, ByteBuffer buf, String filename, Rectangle rect, int bpp) throws TesseractException; - - /** - * Sets tessdata path. - * - * @param datapath the tessdata path to set - */ - void setDatapath(String datapath); - - /** - * Sets language for OCR. - * - * @param language the language code, which follows ISO 639-3 standard. - */ - void setLanguage(String language); - - /** - * Sets OCR engine mode. - * - * @param ocrEngineMode the OcrEngineMode to set - */ - void setOcrEngineMode(int ocrEngineMode); - - /** - * Sets page segmentation mode. - * - * @param mode the page segmentation mode to set - */ - void setPageSegMode(int mode); - - /** - * Sets the value of Tesseract's internal parameter. - * - * @param key variable name, e.g., tessedit_create_hocr, - * tessedit_char_whitelist, etc. - * @param value value for corresponding variable, e.g., "1", "0", - * "0123456789", etc. - */ - void setTessVariable(String key, String value); - - /** - * Sets configs to be passed to Tesseract's Init method. - * - * @param configs list of config filenames, e.g., "digits", "bazaar", - * "quiet" - */ - void setConfigs(List configs); - - /** - * Creates documents for given renderers. - * - * @param filename input image - * @param outputbase output filename without extension - * @param formats types of renderers - * @throws TesseractException - */ - void createDocuments(String filename, String outputbase, List formats) throws TesseractException; - - /** - * Creates documents for given renderers. - * - * @param filenames array of input files - * @param outputbases array of output filenames without extension - * @param formats types of renderers - * @throws TesseractException - */ - void createDocuments(String[] filenames, String[] outputbases, List formats) throws TesseractException; - - /** - * Gets segmented regions at specified page iterator level. - * - * @param bi input image - * @param pageIteratorLevel TessPageIteratorLevel enum - * @return list of Rectangle - * @throws TesseractException - */ - List getSegmentedRegions(BufferedImage bi, int pageIteratorLevel) throws TesseractException; - - /** - * Gets recognized words at specified page iterator level. - * - * @param bi input image - * @param pageIteratorLevel TessPageIteratorLevel enum - * @return list of Word - */ - List getWords(BufferedImage bi, int pageIteratorLevel); -} diff --git a/Tess4J/src/net/sourceforge/tess4j/TessAPI.java b/Tess4J/src/net/sourceforge/tess4j/TessAPI.java deleted file mode 100644 index b9d2b93..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/TessAPI.java +++ /dev/null @@ -1,1225 +0,0 @@ -/** - * Copyright @ 2012 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j; - -import java.nio.ByteBuffer; -import java.nio.DoubleBuffer; -import java.nio.FloatBuffer; -import java.nio.IntBuffer; - -import com.sun.jna.Library; -import com.sun.jna.Pointer; -import com.sun.jna.ptr.IntByReference; -import com.sun.jna.ptr.PointerByReference; - -import com.ochafik.lang.jnaerator.runtime.NativeSize; -import net.sourceforge.lept4j.Boxa; -import net.sourceforge.lept4j.Pix; -import net.sourceforge.tess4j.util.LoadLibs; - -/** - * A Java wrapper for Tesseract OCR 3.04 API using - * JNA Interface Mapping. - */ -public interface TessAPI extends Library, ITessAPI { - - /** - * An instance of the class library. - */ - public static final TessAPI INSTANCE = LoadLibs.getTessAPIInstance(); - - /** - * Gets the version identifier. - * - * @return the version identifier - */ - String TessVersion(); - - /** - * Deallocates the memory block occupied by text. - * - * @param text the pointer to text - */ - void TessDeleteText(Pointer text); - - /** - * Deallocates the memory block occupied by text array. - * - * @param arr text array pointer reference - */ - void TessDeleteTextArray(PointerByReference arr); - - /** - * Deallocates the memory block occupied by integer array. - * - * @param arr int array - */ - void TessDeleteIntArray(IntBuffer arr); - - /* Renderer API */ - TessResultRenderer TessTextRendererCreate(String outputbase); - - TessResultRenderer TessHOcrRendererCreate(String outputbase); - - TessResultRenderer TessHOcrRendererCreate2(String outputbase, int font_info); - - TessResultRenderer TessPDFRendererCreate(String outputbase, String datadir); - - TessResultRenderer TessPDFRendererCreateTextonly(String outputbase, String datadir, int textonly); - - TessResultRenderer TessUnlvRendererCreate(String outputbase); - - TessResultRenderer TessBoxTextRendererCreate(String outputbase); - - void TessDeleteResultRenderer(TessResultRenderer renderer); - - void TessResultRendererInsert(TessResultRenderer renderer, TessResultRenderer next); - - TessResultRenderer TessResultRendererNext(TessResultRenderer renderer); - - int TessResultRendererBeginDocument(TessResultRenderer renderer, String title); - - int TessResultRendererAddImage(TessResultRenderer renderer, PointerByReference api); - - int TessResultRendererEndDocument(TessResultRenderer renderer); - - Pointer TessResultRendererExtention(TessResultRenderer renderer); - - Pointer TessResultRendererTitle(TessResultRenderer renderer); - - int TessResultRendererImageNum(TessResultRenderer renderer); - - /** - * Creates an instance of the base class for all Tesseract APIs. - * - * @return the TesseractAPI instance - */ - TessBaseAPI TessBaseAPICreate(); - - /** - * Disposes the TesseractAPI instance. - * - * @param handle the TesseractAPI instance - */ - void TessBaseAPIDelete(TessBaseAPI handle); - - /** - * Set the name of the input file. Needed only for training and reading a - * UNLV zone file, and for searchable PDF output. - * - * @param handle the TesseractAPI instance - * @param name name of the input file - */ - void TessBaseAPISetInputName(TessBaseAPI handle, String name); - - /** - * These functions are required for searchable PDF output. We need our hands - * on the input file so that we can include it in the PDF without - * transcoding. If that is not possible, we need the original image. - * Finally, resolution metadata is stored in the PDF so we need that as - * well. - * - * @param handle the TesseractAPI instance - * @return input file name - */ - String TessBaseAPIGetInputName(TessBaseAPI handle); - - void TessBaseAPISetInputImage(TessBaseAPI handle, Pix pix); - - Pix TessBaseAPIGetInputImage(TessBaseAPI handle); - - int TessBaseAPIGetSourceYResolution(TessBaseAPI handle); - - String TessBaseAPIGetDatapath(TessBaseAPI handle); - - /** - * Set the name of the bonus output files. Needed only for debugging. - * - * @param handle the TesseractAPI instance - * @param name name of the output file - */ - void TessBaseAPISetOutputName(TessBaseAPI handle, String name); - - /** - * Set the value of an internal "parameter." Supply the name of the - * parameter and the value as a string, just as you would in a config file. - * Returns false if the name lookup failed. E.g., - * SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, - * y and z. Or SetVariable("classify_bln_numeric_mode", "1"); - * to set numeric-only mode. SetVariable may be used before - * Init, but settings will revert to defaults on - * End().
- *
- * Note: Must be called after Init(). Only works for non-init - * variables (init variables should be passed to Init()). - * - * - * @param handle the TesseractAPI instance - * @param name name of the input - * @param value variable value - * @return 1 on success - */ - int TessBaseAPISetVariable(TessBaseAPI handle, String name, String value); - - /** - * Get the value of an internal int parameter. - * - * @param handle the TesseractAPI instance - * @param name name of the input - * @param value pass the int buffer value - * @return 1 on success - */ - int TessBaseAPIGetIntVariable(TessBaseAPI handle, String name, IntBuffer value); - - /** - * Get the value of an internal bool parameter. - * - * @param handle the TesseractAPI instance - * @param name pass the name of the variable - * @param value pass the int buffer value - * @return 1 on success - */ - int TessBaseAPIGetBoolVariable(TessBaseAPI handle, String name, IntBuffer value); - - /** - * Get the value of an internal double parameter. - * - * @param handle the TesseractAPI instance - * @param name pass the name of the variable - * @param value pass the double buffer value - * @return 1 on success - */ - int TessBaseAPIGetDoubleVariable(TessBaseAPI handle, String name, DoubleBuffer value); - - /** - * Get the value of an internal string parameter. - * - * @param handle the TesseractAPI instance - * @param name pass the name of the variable - * @return the string value - */ - String TessBaseAPIGetStringVariable(TessBaseAPI handle, String name); - - /** - * Print Tesseract parameters to the given file.
- *
- * Note: Must not be the first method called after instance create. - * - * @param handle the TesseractAPI instance - * @param filename name of the file where the variables will be persisted - */ - void TessBaseAPIPrintVariablesToFile(TessBaseAPI handle, String filename); - - /** - * Instances are now mostly thread-safe and totally independent, but some - * global parameters remain. Basically it is safe to use multiple - * TessBaseAPIs in different threads in parallel, UNLESS you use - * SetVariable on some of the Params in classify and textord. - * If you do, then the effect will be to change it for all your - * instances.
- *
- * Start tesseract. Returns zero on success and -1 on failure. NOTE that the - * only members that may be called before Init are those listed - * above here in the class definition.
- *
- * It is entirely safe (and eventually will be efficient too) to call - * Init multiple times on the same instance to change language, - * or just to reset the classifier. Languages may specify internally that - * they want to be loaded with one or more other languages, so the ~ - * sign is available to override that. E.g., if hin were set to - * load eng by default, then hin+~eng would force - * loading only hin. The number of loaded languages is limited - * only by memory, with the caveat that loading additional languages will - * impact both speed and accuracy, as there is more work to do to decide on - * the applicable language, and there is more chance of hallucinating - * incorrect words. WARNING: On changing languages, all Tesseract parameters - * are reset back to their default values. (Which may vary between - * languages.) If you have a rare need to set a Variable that controls - * initialization for a second call to Init you should - * explicitly call End() and then use SetVariable - * before Init.
- * This is only a very rare use case, since there are very few uses that - * require any parameters to be set before Init.
- *
- * If set_only_non_debug_params is true, only params that do - * not contain "debug" in the name will be set. - * - * @param handle the TesseractAPI instance - * @param datapath The datapath must be the name of the parent - * directory of tessdata and must end in - * /. Any name after the last / will be stripped. - * @param language The language is (usually) an ISO 639-3 - * string or NULL will default to eng. The - * language may be a string of the form [~]<lang>[+[~]<lang>] - * indicating that multiple languages are to be loaded. E.g., - * hin+eng will load Hindi and English. - * @param oem ocr engine mode - * @param configs pointer configuration - * @param configs_size pointer configuration size - * @return 0 on success and -1 on initialization failure - */ - int TessBaseAPIInit1(TessBaseAPI handle, String datapath, String language, int oem, - PointerByReference configs, int configs_size); - - /** - * @param handle the TesseractAPI instance - * @param datapath The datapath must be the name of the parent - * directory of tessdata and must end in - * /. Any name after the last / will be stripped. - * @param language The language is (usually) an ISO 639-3 - * string or NULL will default to eng. The - * language may be a string of the form [~]<lang>[+[~]<lang>] - * indicating that multiple languages are to be loaded. E.g., - * hin+eng will load Hindi and English. - * @param oem ocr engine mode - * @return 0 on success and -1 on initialization failure - */ - int TessBaseAPIInit2(TessBaseAPI handle, String datapath, String language, int oem); - - /** - * @param handle the TesseractAPI instance - * @param datapath The datapath must be the name of the parent - * directory of tessdata and must end in - * /. Any name after the last / will be stripped. - * @param language The language is (usually) an ISO 639-3 - * string or NULL will default to eng. The - * language may be a string of the form [~]<lang>[+[~]<lang>] - * indicating that multiple languages are to be loaded. E.g., - * hin+eng will load Hindi and English. - * @return 0 on success and -1 on initialization failure - */ - int TessBaseAPIInit3(TessBaseAPI handle, String datapath, String language); - - /** - * - * @param handle the TesseractAPI instance - * @param datapath The datapath must be the name of the parent - * directory of tessdata and must end in - * /. Any name after the last / will be stripped. - * @param language The language is (usually) an ISO 639-3 - * string or NULL will default to eng. The - * language may be a string of the form [~]<lang>[+[~]<lang>] - * indicating that multiple languages are to be loaded. E.g., - * hin+eng will load Hindi and English. - * @param oem ocr engine mode - * @param configs pointer configuration - * @param configs_size pointer configuration size - * @param vars_vec - * @param vars_values - * @param vars_vec_size - * @param set_only_non_debug_params - * @return 0 on success and -1 on initialization failure - */ - int TessBaseAPIInit4(TessBaseAPI handle, String datapath, String language, int oem, PointerByReference configs, int configs_size, PointerByReference vars_vec, PointerByReference vars_values, NativeSize vars_vec_size, int set_only_non_debug_params); - - /** - * Returns the languages string used in the last valid initialization. If - * the last initialization specified "deu+hin" then that will be returned. - * If hin loaded eng automatically as well, then - * that will not be included in this list. To find the languages actually - * loaded, use GetLoadedLanguagesAsVector. The returned string - * should NOT be deleted. - * - * @param handle the TesseractAPI instance - * @return languages as string - */ - String TessBaseAPIGetInitLanguagesAsString(TessBaseAPI handle); - - /** - * Returns the loaded languages in the vector of STRINGs. Includes all - * languages loaded by the last Init, including those loaded as - * dependencies of other loaded languages. - * - * @param handle the TesseractAPI instance - * @return loaded languages as vector - */ - PointerByReference TessBaseAPIGetLoadedLanguagesAsVector(TessBaseAPI handle); - - /** - * Returns the available languages in the vector of STRINGs. - * - * @param handle the TesseractAPI instance - * @return available languages as vector - */ - PointerByReference TessBaseAPIGetAvailableLanguagesAsVector(TessBaseAPI handle); - - /** - * Init only the lang model component of Tesseract. The only functions that - * work after this init are SetVariable and - * IsValidWord. WARNING: temporary! This function will be - * removed from here and placed in a separate API at some future time. - * - * @param handle the TesseractAPI instance - * @param datapath The datapath must be the name of the parent - * directory of tessdata and must end in - * /. Any name after the last / will be stripped. - * @param language The language is (usually) an ISO 639-3 - * string or NULL will default to eng. The language may be a - * string of the form [~]<lang>[+[~]<lang>] indicating that - * multiple languages are to be loaded. E.g., hin+eng will load Hindi and - * English. - * @return api init language mode - */ - int TessBaseAPIInitLangMod(TessBaseAPI handle, String datapath, String language); - - /** - * Init only for page layout analysis. Use only for calls to - * SetImage and AnalysePage. Calls that attempt - * recognition will generate an error. - * - * @param handle the TesseractAPI instance - */ - void TessBaseAPIInitForAnalysePage(TessBaseAPI handle); - - /** - * Read a "config" file containing a set of param, value pairs. Searches the - * standard places: tessdata/configs, - * tessdata/tessconfigs and also accepts a relative or absolute - * path name. Note: only non-init params will be set (init params are set by - * Init()). - * - * - * @param handle the TesseractAPI instance - * @param filename relative or absolute path for the "config" file - * containing a set of param and value pairs - * @param init_only - */ - void TessBaseAPIReadConfigFile(TessBaseAPI handle, String filename, int init_only); - - /** - * Set the current page segmentation mode. Defaults to - * PSM_SINGLE_BLOCK. The mode is stored as an IntParam so it - * can also be modified by ReadConfigFile or - * SetVariable("tessedit_pageseg_mode", mode as string). - * - * @param handle the TesseractAPI instance - * @param mode tesseract page segment mode - */ - void TessBaseAPISetPageSegMode(TessBaseAPI handle, int mode); - - /** - * Return the current page segmentation mode. - * - * @param handle the TesseractAPI instance - * @return page segment mode value - */ - int TessBaseAPIGetPageSegMode(TessBaseAPI handle); - - /** - * Recognize a rectangle from an image and return the result as a string. - * May be called many times for a single Init. Currently has no - * error checking. Greyscale of 8 and color of 24 or 32 bits per pixel may - * be given. Palette color images will not work properly and must be - * converted to 24 bit. Binary images of 1 bit per pixel may also be given - * but they must be byte packed with the MSB of the first byte being the - * first pixel, and a 1 represents WHITE. For binary images set - * bytes_per_pixel=0. The recognized text is returned as a char* which is - * coded as UTF8 and must be freed with the delete [] operator.
- *
- * Note that TesseractRect is the simplified convenience - * interface. For advanced uses, use SetImage, (optionally) - * SetRectangle, Recognize, and one or more of the - * Get*Text functions below. - * - * @param handle the TesseractAPI instance - * @param imagedata image byte buffer - * @param bytes_per_pixel bytes per pixel - * @param bytes_per_line bytes per line - * @param left image left - * @param top image top - * @param width image width - * @param height image height - * @return the pointer to recognized text - */ - Pointer TessBaseAPIRect(TessBaseAPI handle, ByteBuffer imagedata, int bytes_per_pixel, int bytes_per_line, - int left, int top, int width, int height); - - /** - * Call between pages or documents etc to free up memory and forget adaptive - * data. - * - * @param handle the TesseractAPI instance - */ - void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI handle); - - /** - * Provide an image for Tesseract to recognize. Format is as - * TesseractRect above. Does not copy the image buffer, or take - * ownership. The source image may be destroyed after Recognize is called, - * either explicitly or implicitly via one of the Get*Text - * functions. SetImage clears all recognition results, and sets - * the rectangle to the full image, so it may be followed immediately by a - * GetUTF8Text, and it will automatically perform recognition. - * - * @param handle the TesseractAPI instance - * @param imagedata image byte buffer - * @param width image width - * @param height image height - * @param bytes_per_pixel bytes per pixel - * @param bytes_per_line bytes per line - */ - void TessBaseAPISetImage(TessBaseAPI handle, ByteBuffer imagedata, int width, int height, - int bytes_per_pixel, int bytes_per_line); - - /** - * Provide an image for Tesseract to recognize. As with - * SetImage above, Tesseract doesn't take a copy or ownership - * or pixDestroy the image, so it must persist until after - * Recognize. Pix vs raw, which to use? Use - * Pix where possible. A future version of Tesseract may choose - * to use Pix as its internal representation and discard - * IMAGE altogether. Because of that, an implementation that - * sources and targets Pix may end up with less copies than an - * implementation that does not. - * - * @param handle the TesseractAPI instance - * @param pix image - */ - void TessBaseAPISetImage2(TessBaseAPI handle, Pix pix); - - /** - * Set the resolution of the source image in pixels per inch so font size - * information can be calculated in results. Call this after - * SetImage(). - * - * @param handle the TesseractAPI instance - * @param ppi source resolution value - */ - void TessBaseAPISetSourceResolution(TessBaseAPI handle, int ppi); - - /** - * Restrict recognition to a sub-rectangle of the image. Call after - * SetImage. Each SetRectangle clears the - * recognition results so multiple rectangles can be recognized with the - * same image. - * - * @param handle the TesseractAPI instance - * @param left value - * @param top value - * @param width value - * @param height value - */ - void TessBaseAPISetRectangle(TessBaseAPI handle, int left, int top, int width, int height); - - /** - * ONLY available after SetImage if you have Leptonica - * installed. Get a copy of the internal thresholded image from Tesseract. - * - * @param handle the TesseractAPI instance - * @return internal thresholded image - */ - Pix TessBaseAPIGetThresholdedImage(TessBaseAPI handle); - - /** - * Get the result of page layout analysis as a Leptonica-style - * Boxa, Pixa pair, in reading order. Can be - * called before or after Recognize. - * - * @param handle the TesseractAPI instance - * @param pixa array of Pix - * @return array of Box - */ - Boxa TessBaseAPIGetRegions(TessBaseAPI handle, PointerByReference pixa); - - /** - * Get the textlines as a Leptonica-style Boxa, - * Pixa pair, in reading order. Can be called before or after - * Recognize. If blockids is not NULL, the - * block-id of each line is also returned as an array of one element per - * line. delete [] after use. If paraids is not - * NULL, the paragraph-id of each line within its block is also - * returned as an array of one element per line. delete [] after use.
- * Helper method to extract from the thresholded image (most common usage). - * - * @param handle the TesseractAPI instance - * @param pixa array of Pix - * @param blockids - * @return array of Box - */ - Boxa TessBaseAPIGetTextlines(TessBaseAPI handle, PointerByReference pixa, PointerByReference blockids); - - /** - * Get the textlines as a Leptonica-style Boxa, - * Pixa pair, in reading order. Can be called before or after - * Recognize. If blockids is not NULL, the - * block-id of each line is also returned as an array of one element per - * line. delete [] after use. If paraids is not - * NULL, the paragraph-id of each line within its block is also - * returned as an array of one element per line. delete [] after use. - * - * @param handle the TesseractAPI instance - * @param raw_image - * @param raw_padding - * @param pixa array of Pix - * @param blockids - * @param paraids - * @return array of Box - */ - Boxa TessBaseAPIGetTextlines1(TessBaseAPI handle, int raw_image, int raw_padding, PointerByReference pixa, PointerByReference blockids, PointerByReference paraids); - - /** - * Get textlines and strips of image regions as a Leptonica-style - * Boxa, Pixa pair, in reading order. Enables - * downstream handling of non-rectangular regions. Can be called before or - * after Recognize. If blockids is not NULL, the block-id of - * each line is also returned as an array of one element per line. delete [] - * after use. - * - * @param handle the TesseractAPI instance - * @param pixa array of Pix - * @param blockids - * @return array of Box - */ - Boxa TessBaseAPIGetStrips(TessBaseAPI handle, PointerByReference pixa, PointerByReference blockids); - - /** - * Get the words as a Leptonica-style Boxa, Pixa - * pair, in reading order. Can be called before or after - * Recognize. - * - * @param handle the TesseractAPI instance - * @param pixa array of Pix - * @return array of Box - */ - Boxa TessBaseAPIGetWords(TessBaseAPI handle, PointerByReference pixa); - - /** - * Gets the individual connected (text) components (created after pages - * segmentation step, but before recognition) as a Leptonica-style - * Boxa, Pixa pair, in reading order. Can be - * called before or after Recognize. - * - * @param handle the TesseractAPI instance - * @param cc array of Pix - * @return array of Box - */ - Boxa TessBaseAPIGetConnectedComponents(TessBaseAPI handle, PointerByReference cc); - - /** - * Get the given level kind of components (block, textline, word etc.) as a - * Leptonica-style Boxa, Pixa pair, in reading - * order. Can be called before or after Recognize. If blockids - * is not NULL, the block-id of each component is also returned - * as an array of one element per component. delete [] after use. If - * text_only is true, then only text components are returned. - * Helper function to get binary images with no padding (most common usage). - * - * @param handle the TesseractAPI instance - * @param level PageIteratorLevel - * @param text_only - * @param pixa array of Pix - * @param blockids - * @return array of Box - */ - Boxa TessBaseAPIGetComponentImages(TessBaseAPI handle, int level, int text_only, PointerByReference pixa, PointerByReference blockids); - - /** - * Get the given level kind of components (block, textline, word etc.) as a - * Leptonica-style Boxa, Pixa pair, in reading - * order. Can be called before or after Recognize. If blockids - * is not NULL, the block-id of each component is also returned - * as an array of one element per component. delete [] after use. If - * paraids is not NULL, the paragraph-id of each - * component with its block is also returned as an array of one element per - * component. delete [] after use. If raw_image is true, then - * portions of the original image are extracted instead of the thresholded - * image and padded with raw_padding. If text_only is true, - * then only text components are returned. - * - * @param handle the TesseractAPI instance - * @param level PageIteratorLevel - * @param text_only - * @param raw_image - * @param raw_padding - * @param pixa array of Pix - * @param blockids - * @param paraids - * @return - */ - Boxa TessBaseAPIGetComponentImages1(TessBaseAPI handle, int level, int text_only, int raw_image, int raw_padding, PointerByReference pixa, PointerByReference blockids, PointerByReference paraids); - - /** - * @param handle the TesseractAPI instance - * @return Scale factor from original image. - */ - int TessBaseAPIGetThresholdedImageScaleFactor(TessBaseAPI handle); - - /** - * Dump the internal binary image to a PGM file. - * - * @param handle the TesseractAPI instance - * @param filename pgm file name - */ - void TessBaseAPIDumpPGM(TessBaseAPI handle, String filename); - - /** - * Runs page layout analysis in the mode set by SetPageSegMode. - * May optionally be called prior to Recognize to get access to - * just the page layout results. Returns an iterator to the results. Returns - * NULL on error. The returned iterator must be deleted after - * use. WARNING! This class points to data held within the - * TessBaseAPI class, and therefore can only be used while the - * TessBaseAPI class still exists and has not been subjected to - * a call of Init, SetImage, - * Recognize, Clear, End, DetectOS, - * or anything else that changes the internal PAGE_RES. - * - * @param handle the TesseractAPI instance - * @return returns an iterator to the results. Returns NULL on error. The - * returned iterator must be deleted after use. - */ - TessPageIterator TessBaseAPIAnalyseLayout(TessBaseAPI handle); - - /** - * Recognize the image from SetAndThresholdImage, generating - * Tesseract internal structures. Returns 0 on success. Optional. The - * Get*Text functions below will call Recognize if - * needed. After Recognize, the output is kept internally until - * the next SetImage. - * - * @param handle the TesseractAPI instance - * @param monitor the result as Tesseract internal structures - * @return 0 on success - */ - int TessBaseAPIRecognize(TessBaseAPI handle, ETEXT_DESC monitor); - - /** - * Variant on Recognize used for testing chopper. - * - * @param handle the TesseractAPI instance - * @param monitor the result as Tesseract internal structures - * @return 0 on success - */ - int TessBaseAPIRecognizeForChopTest(TessBaseAPI handle, ETEXT_DESC monitor); - - /** - * Get a reading-order iterator to the results of LayoutAnalysis and/or - * Recognize. The returned iterator must be deleted after use. - * WARNING! This class points to data held within the - * TessBaseAPI class, and therefore can only be used while the - * TessBaseAPI class still exists and has not been subjected to - * a call of Init, SetImage, - * Recognize, Clear, End, DetectOS, - * or anything else that changes the internal PAGE_RES. - * - * @param handle the TesseractAPI instance - * @return the result iterator - */ - TessResultIterator TessBaseAPIGetIterator(TessBaseAPI handle); - - /** - * Get a mutable iterator to the results of LayoutAnalysis and/or - * Recognize. The returned iterator must be deleted after use. - * WARNING! This class points to data held within the - * TessBaseAPI class, and therefore can only be used while the - * TessBaseAPI class still exists and has not been subjected to - * a call of Init, SetImage, - * Recognize, Clear, End, DetectOS, - * or anything else that changes the internal PAGE_RES. - * - * @param handle the TesseractAPI instance - * @return the mutable iterator - */ - TessMutableIterator TessBaseAPIGetMutableIterator(TessBaseAPI handle); - - /** - * Recognizes all the pages in the named file, as a multi-page tiff or list - * of filenames, or single image, and gets the appropriate kind of text - * according to parameters: tessedit_create_boxfile, - * tessedit_make_boxes_from_boxes, - * tessedit_write_unlv, tessedit_create_hocr. - * Calls ProcessPage on each page in the input file, which may be a - * multi-page tiff, single-page other file format, or a plain text list of - * images to read. If tessedit_page_number is non-negative, processing - * begins at that page of a multi-page tiff file, or filelist. The text is - * returned in text_out. Returns false on error. If non-zero - * timeout_millisec terminates processing after the timeout on a single - * page. If non-NULL and non-empty, and some page fails for some reason, the - * page is reprocessed with the retry_config config file. Useful for - * interactively debugging a bad page. - * - * @param handle the TesseractAPI instance - * @param filename multi-page tiff or list of filenames - * @param retry_config retry config values - * @param timeout_millisec timeout value - * @param renderer result renderer - * @return the status - */ - int TessBaseAPIProcessPages(TessBaseAPI handle, String filename, String retry_config, int timeout_millisec, TessResultRenderer renderer); - - int TessBaseAPIProcessPage(TessBaseAPI handle, Pix pix, int page_index, String filename, String retry_config, int timeout_millisec, TessResultRenderer renderer); - - /** - * The recognized text is returned as a char* which is coded as UTF-8 and - * must be freed with the delete [] operator. - * - * @param handle the TesseractAPI instance - * @return the pointer to output text - */ - Pointer TessBaseAPIGetUTF8Text(TessBaseAPI handle); - - /** - * Make a HTML-formatted string with hOCR markup from the internal data - * structures. page_number is 0-based but will appear in the output as - * 1-based. - * - * @param handle the TesseractAPI instance - * @param page_number page number - * @return the pointer to hOCR text - */ - Pointer TessBaseAPIGetHOCRText(TessBaseAPI handle, int page_number); - - /** - * The recognized text is returned as a char* which is coded as a UTF8 box - * file and must be freed with the delete [] operator. page_number is a - * 0-base page index that will appear in the box file. - * - * @param handle the TesseractAPI instance - * @param page_number number of the page - * @return the pointer to box text - */ - Pointer TessBaseAPIGetBoxText(TessBaseAPI handle, int page_number); - - /** - * The recognized text is returned as a char* which is coded as UNLV format - * Latin-1 with specific reject and suspect codes and must be freed with the - * delete [] operator. - * - * @param handle the TesseractAPI instance - * @return the pointer to UNLV text - */ - Pointer TessBaseAPIGetUNLVText(TessBaseAPI handle); - - /** - * Returns the average word confidence for Tesseract page result. - * - * @param handle the TesseractAPI instance - * @return the (average) confidence value between 0 and 100. - */ - int TessBaseAPIMeanTextConf(TessBaseAPI handle); - - /** - * Returns an array of all word confidences, terminated by -1. The calling - * function must delete [] after use. The number of confidences should - * correspond to the number of space-delimited words in - * GetUTF8Text. - * - * @param handle the TesseractAPI instance - * @return all word confidences (between 0 and 100) in an array, terminated - * by -1 - */ - IntByReference TessBaseAPIAllWordConfidences(TessBaseAPI handle); - - /** - * Applies the given word to the adaptive classifier if possible. The word - * must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can tell the - * boundaries of the graphemes. Assumes that - * SetImage/SetRectangle have been used to set the - * image to the given word. The mode arg should be - * PSM_SINGLE_WORD or PSM_CIRCLE_WORD, as that - * will be used to control layout analysis. The currently set PageSegMode is - * preserved. - * - * @param handle the TesseractAPI instance - * @param mode tesseract page segment mode - * @param wordstr The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , - * so it can tell the boundaries of the graphemes. - * @return false if adaption was not possible for some reason. - */ - int TessBaseAPIAdaptToWordStr(TessBaseAPI handle, int mode, String wordstr); - - /** - * Free up recognition results and any stored image data, without actually - * freeing any recognition data that would be time-consuming to reload. - * Afterwards, you must call SetImage or - * TesseractRect before doing any Recognize or - * Get* operation. - * - * @param handle the TesseractAPI instance - */ - void TessBaseAPIClear(TessBaseAPI handle); - - /** - * Close down tesseract and free up all memory. End() is - * equivalent to destructing and reconstructing your TessBaseAPI. Once - * End() has been used, none of the other API functions may be - * used other than Init and anything declared above it in the - * class definition. - * - * @param handle the TesseractAPI instance - */ - void TessBaseAPIEnd(TessBaseAPI handle); - - /** - * Check whether a word is valid according to Tesseract's language model. - * - * @param handle the TesseractAPI instance - * @param word word value - * @return 0 if the word is invalid, non-zero if valid - */ - int TessBaseAPIIsValidWord(TessBaseAPI handle, String word); - - /** - * Gets text direction. - * - * @param handle the TesseractAPI instance - * @param out_offset offset - * @param out_slope slope - * @return TRUE if text direction is valid - */ - int TessBaseAPIGetTextDirection(TessBaseAPI handle, IntBuffer out_offset, FloatBuffer out_slope); - - /** - * Clear any library-level memory caches. There are a variety of - * expensive-to-load constant data structures (mostly language dictionaries) - * that are cached globally -- surviving the Init() and - * End() of individual TessBaseAPI's. This function allows the - * clearing of these caches. - * - * @param handle the TesseractAPI instance - */ - void TessBaseAPIClearPersistentCache(TessBaseAPI handle); - - /** - * Detect the orientation of the input image and apparent script (alphabet). - * orient_deg is the detected clockwise rotation of the input image in - * degrees (0, 90, 180, 270); orient_conf is the confidence (15.0 is - * reasonably confident); script_name is an ASCII string, the name of the - * script, e.g. "Latin"; script_conf is confidence level in the script. - * - * @return TRUE on success and writes values to each parameter as an output - */ - int TessBaseAPIDetectOrientationScript(TessBaseAPI handle, IntBuffer orient_deg, FloatBuffer orient_conf, PointerByReference script_name, FloatBuffer script_conf); - - /** - * Gets the string of the specified unichar. - * - * @param handle the TesseractAPI instance - * @param unichar_id the unichar id - * @return the string form of the specified unichar. - */ - String TessBaseAPIGetUnichar(TessBaseAPI handle, int unichar_id); - - /** - * Deletes the specified PageIterator instance. - * - * @param handle the TessPageIterator instance - */ - void TessPageIteratorDelete(TessPageIterator handle); - - /** - * Creates a copy of the specified PageIterator instance. - * - * @param handle the TessPageIterator instance - * @return page iterator copy - */ - TessPageIterator TessPageIteratorCopy(TessPageIterator handle); - - /** - * Resets the iterator to point to the start of the page. - * - * @param handle the TessPageIterator instance - */ - void TessPageIteratorBegin(TessPageIterator handle); - - /** - * Moves to the start of the next object at the given level in the page - * hierarchy, and returns false if the end of the page was reached. NOTE - * (CHANGED!) that ALL PageIteratorLevel level values will visit each - * non-text block at least once.
- * Think of non text blocks as containing a single para, with at least one - * line, with a single imaginary word, containing a single symbol. The - * bounding boxes mark out any polygonal nature of the block, and - * PTIsTextType(BLockType()) is false for non-text blocks.
- * Calls to Next with different levels may be freely intermixed. This - * function iterates words in right-to-left scripts correctly, if the - * appropriate language has been loaded into Tesseract. - * - * @param handle the TessPageIterator instance - * @param level tesseract page level - * @return next iterator object - */ - int TessPageIteratorNext(TessPageIterator handle, int level); - - /** - * Returns TRUE if the iterator is at the start of an object at the given - * level. Possible uses include determining if a call to Next(RIL_WORD) - * moved to the start of a RIL_PARA. - * - * @param handle the TessPageIterator instance - * @param level tesseract page level - * @return 1 if true - */ - int TessPageIteratorIsAtBeginningOf(TessPageIterator handle, int level); - - /** - * Returns whether the iterator is positioned at the last element in a given - * level. (e.g. the last word in a line, the last line in a block). - * - * @param handle the TessPageIterator instance - * @param level tesseract page level - * @param element page iterator level - * @return 1 if true - */ - int TessPageIteratorIsAtFinalElement(TessPageIterator handle, int level, int element); - - /** - * Returns the bounding rectangle of the current object at the given level - * in coordinates of the original image. - * - * @param handle the TessPageIterator instance - * @param level tesseract page level - * @param left int buffer position - * @param top int buffer position - * @param right int buffer position - * @param bottom int buffer position - * @return FALSE if there is no such object at the current position - */ - int TessPageIteratorBoundingBox(TessPageIterator handle, int level, IntBuffer left, IntBuffer top, - IntBuffer right, IntBuffer bottom); - - /** - * Returns the type of the current block. - * - * @param handle the TessPageIterator instance - * @return TessPolyBlockType value - */ - int TessPageIteratorBlockType(TessPageIterator handle); - - /** - * Returns a binary image of the current object at the given level. The - * position and size match the return from BoundingBoxInternal, and so this - * could be upscaled with respect to the original input image. Use - * pixDestroy to delete the image after use. The following - * methods are used to generate the images: RIL_BLOCK: mask the - * page image with the block polygon. RIL_TEXTLINE: Clip the - * rectangle of the line box from the page image. TODO(rays) fix this to - * generate and use a line polygon. RIL_WORD: Clip the - * rectangle of the word box from the page image. RIL_SYMBOL: - * Render the symbol outline to an image for cblobs (prior to recognition) - * or the bounding box otherwise. A reconstruction of the original image - * (using xor to check for double representation) should be reasonably - * accurate, apart from removed noise, at the block level. Below the block - * level, the reconstruction will be missing images and line separators. At - * the symbol level, kerned characters will be invade the bounding box if - * rendered after recognition, making an xor reconstruction inaccurate, but - * an or construction better. Before recognition, symbol-level - * reconstruction should be good, even with xor, since the images come from - * the connected components. - * - * @param handle the TessPageIterator instance - * @param level PageIteratorLevel - * @return - */ - Pix TessPageIteratorGetBinaryImage(TessPageIterator handle, int level); - - /** - * Returns an image of the current object at the given level in greyscale if - * available in the input. To guarantee a binary image use BinaryImage. NOTE - * that in order to give the best possible image, the bounds are expanded - * slightly over the binary connected component, by the supplied padding, so - * the top-left position of the returned image is returned in (left,top). - * These will most likely not match the coordinates returned by BoundingBox. - * If you do not supply an original image, you will get a binary one. Use - * pixDestroy to delete the image after use. - * - * @param handle the TessPageIterator instance - * @param level PageIteratorLevel - * @param padding - * @param original_image - * @param left - * @param top - * @return - */ - Pix TessPageIteratorGetImage(TessPageIterator handle, int level, int padding, Pix original_image, IntBuffer left, IntBuffer top); - - /** - * Returns the baseline of the current object at the given level. The - * baseline is the line that passes through (x1, y1) and (x2, y2).
- * WARNING: with vertical text, baselines may be vertical! - * - * @param handle the TessPageIterator instance - * @param level PageIteratorLevel - * @param x1 int buffer position - * @param y1 int buffer position - * @param x2 int buffer position - * @param y2 int buffer position - * @return TRUE if the baseline is valid - */ - int TessPageIteratorBaseline(TessPageIterator handle, int level, IntBuffer x1, IntBuffer y1, IntBuffer x2, - IntBuffer y2); - - /** - * Returns the orientation. - * - * @param handle the TessPageIterator instance - * @param orientation orientation value - * @param writing_direction writing direction value - * @param textline_order text line order - * @param deskew_angle deskew angle - */ - void TessPageIteratorOrientation(TessPageIterator handle, IntBuffer orientation, - IntBuffer writing_direction, IntBuffer textline_order, FloatBuffer deskew_angle); - - /** - * Gets paragraph information. - * - * @param handle the TessPageIterator instance - * @param justification justification type - * @param is_list_item list item - * @param is_crown very first or continuation - * @param first_line_indent first line indentation - */ - void TessPageIteratorParagraphInfo(TessPageIterator handle, IntBuffer justification, - IntBuffer is_list_item, IntBuffer is_crown, IntBuffer first_line_indent); - - /** - * Deletes the specified ResultIterator handle. - * - * @param handle the TessResultIterator instance - */ - void TessResultIteratorDelete(TessResultIterator handle); - - /** - * Creates a copy of the specified ResultIterator instance. - * - * @param handle the TessResultIterator instance - * @return the copy object - */ - TessResultIterator TessResultIteratorCopy(TessResultIterator handle); - - /** - * Gets the PageIterator of the specified ResultIterator instance. - * - * @param handle the TessResultIterator instance - * @return the page iterator - */ - TessPageIterator TessResultIteratorGetPageIterator(TessResultIterator handle); - - /** - * Gets the PageIterator of the specified ResultIterator instance. - * - * @param handle the TessResultIterator instance - * @return the page iterator constant - */ - TessPageIterator TessResultIteratorGetPageIteratorConst(TessResultIterator handle); - - int TessResultIteratorNext(TessResultIterator handle, int level); - - /** - * Returns the null terminated UTF-8 encoded text string for the current - * object at the given level. Use delete [] to free after use. - * - * @param handle the TessResultIterator instance - * @param level tesseract page level - * @return the pointer to recognized text - */ - Pointer TessResultIteratorGetUTF8Text(TessResultIterator handle, int level); - - /** - * Returns the mean confidence of the current object at the given level. The - * number should be interpreted as a percent probability (0.0f-100.0f). - * - * @param handle the TessResultIterator instance - * @param level tesseract page level - * @return confidence value - */ - float TessResultIteratorConfidence(TessResultIterator handle, int level); - - String TessResultIteratorWordRecognitionLanguage(TessResultIterator handle); - - /** - * Returns the font attributes of the current word. If iterating at a higher - * level object than words, e.g., textlines, then this will return the - * attributes of the first word in that textline. The actual return value is - * a string representing a font name. It points to an internal table and - * SHOULD NOT BE DELETED. Lifespan is the same as the iterator itself, ie - * rendered invalid by various members of TessBaseAPI, including - * Init, SetImage, End or deleting - * the TessBaseAPI. Pointsize is returned in printers points (1/72 inch). - * - * @param handle the TessResultIterator instance - * @param is_bold font attribute - * @param is_italic font attribute - * @param is_underlined font attribute - * @param is_monospace font attribute - * @param is_serif font attribute - * @param is_smallcaps font attribute - * @param pointsize font attribute - * @param font_id font attribute - * @return font name - */ - String TessResultIteratorWordFontAttributes(TessResultIterator handle, IntBuffer is_bold, - IntBuffer is_italic, IntBuffer is_underlined, IntBuffer is_monospace, IntBuffer is_serif, - IntBuffer is_smallcaps, IntBuffer pointsize, IntBuffer font_id); - - /** - * Returns TRUE if the current word was found in a dictionary. - * - * @param handle the TessResultIterator instance - * @return 1 if word is from dictionary - */ - int TessResultIteratorWordIsFromDictionary(TessResultIterator handle); - - /** - * Returns TRUE if the current word is numeric. - * - * @param handle the TessResultIterator instance - * @return 1 if word is numeric - */ - int TessResultIteratorWordIsNumeric(TessResultIterator handle); - - /** - * Returns TRUE if the current symbol is a superscript. If iterating at a - * higher level object than symbols, e.g., words, then this will return the - * attributes of the first symbol in that word. - * - * @param handle the TessResultIterator instance - * @return 1 if symbol is superscript - */ - int TessResultIteratorSymbolIsSuperscript(TessResultIterator handle); - - /** - * Returns TRUE if the current symbol is a subscript. If iterating at a - * higher level object than symbols, e.g., words, then this will return the - * attributes of the first symbol in that word. - * - * @param handle the TessResultIterator instance - * @return 1 if symbol is subscript - */ - int TessResultIteratorSymbolIsSubscript(TessResultIterator handle); - - /** - * Returns TRUE if the current symbol is a dropcap. If iterating at a higher - * level object than symbols, e.g., words, then this will return the - * attributes of the first symbol in that word. - * - * @param handle the TessResultIterator instance - * @return 1 if symbol is dropcap - */ - int TessResultIteratorSymbolIsDropcap(TessResultIterator handle); - - /* Choice iterator */ - TessChoiceIterator TessResultIteratorGetChoiceIterator(TessResultIterator handle); - - void TessChoiceIteratorDelete(TessChoiceIterator handle); - - int TessChoiceIteratorNext(TessChoiceIterator handle); - - String TessChoiceIteratorGetUTF8Text(TessChoiceIterator handle); - - float TessChoiceIteratorConfidence(TessChoiceIterator handle); -} diff --git a/Tess4J/src/net/sourceforge/tess4j/TessAPI1.java b/Tess4J/src/net/sourceforge/tess4j/TessAPI1.java deleted file mode 100644 index c113b86..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/TessAPI1.java +++ /dev/null @@ -1,1228 +0,0 @@ -/** - * Copyright @ 2012 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j; - -import java.nio.ByteBuffer; -import java.nio.DoubleBuffer; -import java.nio.FloatBuffer; -import java.nio.IntBuffer; - -import com.sun.jna.Library; -import com.sun.jna.Native; -import com.sun.jna.Pointer; -import com.sun.jna.ptr.IntByReference; -import com.sun.jna.ptr.PointerByReference; - -import com.ochafik.lang.jnaerator.runtime.NativeSize; -import net.sourceforge.lept4j.Boxa; -import net.sourceforge.lept4j.Pix; -import net.sourceforge.tess4j.util.LoadLibs; - -/** - * A Java wrapper for Tesseract OCR 3.04 API using - * JNA Direct Mapping. - */ -public class TessAPI1 implements Library, ITessAPI { - - static { - Native.register(LoadLibs.getTesseractLibName()); - } - - /** - * Gets the version identifier. - * - * @return the version identifier - */ - public static native String TessVersion(); - - /** - * Deallocates the memory block occupied by text. - * - * @param text the pointer to text - */ - public static native void TessDeleteText(Pointer text); - - /** - * Deallocates the memory block occupied by text array. - * - * @param arr text array pointer reference - */ - public static native void TessDeleteTextArray(PointerByReference arr); - - /** - * Deallocates the memory block occupied by integer array. - * - * @param arr int array - */ - public static native void TessDeleteIntArray(IntBuffer arr); - - /* Renderer API */ - public static native TessResultRenderer TessTextRendererCreate(String outputbase); - - public static native TessResultRenderer TessHOcrRendererCreate(String outputbase); - - public static native TessResultRenderer TessHOcrRendererCreate2(String outputbase, int font_info); - - public static native TessResultRenderer TessPDFRendererCreate(String outputbase, String datadir); - - public static native TessResultRenderer TessPDFRendererCreateTextonly(String outputbase, String datadir, int textonly); - - public static native TessResultRenderer TessUnlvRendererCreate(String outputbase); - - public static native TessResultRenderer TessBoxTextRendererCreate(String outputbase); - - public static native void TessDeleteResultRenderer(TessResultRenderer renderer); - - public static native void TessResultRendererInsert(TessResultRenderer renderer, TessResultRenderer next); - - public static native TessResultRenderer TessResultRendererNext(TessResultRenderer renderer); - - public static native int TessResultRendererBeginDocument(TessResultRenderer renderer, String title); - - public static native int TessResultRendererAddImage(TessResultRenderer renderer, PointerByReference api); - - public static native int TessResultRendererEndDocument(TessResultRenderer renderer); - - public static native Pointer TessResultRendererExtention(TessResultRenderer renderer); - - public static native Pointer TessResultRendererTitle(TessResultRenderer renderer); - - public static native int TessResultRendererImageNum(TessResultRenderer renderer); - - /** - * Creates an instance of the base class for all Tesseract APIs. - * - * @return the TesseractAPI instance - */ - public static native TessBaseAPI TessBaseAPICreate(); - - /** - * Disposes the TesseractAPI instance. - * - * @param handle the TesseractAPI instance - */ - public static native void TessBaseAPIDelete(TessBaseAPI handle); - - /** - * Set the name of the input file. Needed only for training and reading a - * UNLV zone file, and for searchable PDF output. - * - * @param handle the TesseractAPI instance - * @param name name of the input file - */ - public static native void TessBaseAPISetInputName(TessBaseAPI handle, String name); - - /** - * These functions are required for searchable PDF output. We need our hands - * on the input file so that we can include it in the PDF without - * transcoding. If that is not possible, we need the original image. - * Finally, resolution metadata is stored in the PDF so we need that as - * well. - * - * @param handle the TesseractAPI instance - * @return input file name - */ - public static native String TessBaseAPIGetInputName(TessBaseAPI handle); - - public static native void TessBaseAPISetInputImage(TessBaseAPI handle, Pix pix); - - public static native Pix TessBaseAPIGetInputImage(TessBaseAPI handle); - - public static native int TessBaseAPIGetSourceYResolution(TessBaseAPI handle); - - public static native String TessBaseAPIGetDatapath(TessBaseAPI handle); - - /** - * Set the name of the bonus output files. Needed only for debugging. - * - * @param handle the TesseractAPI instance - * @param name name of the output file - */ - public static native void TessBaseAPISetOutputName(TessBaseAPI handle, String name); - - /** - * Set the value of an internal "parameter." Supply the name of the - * parameter and the value as a string, just as you would in a config file. - * Returns false if the name lookup failed. E.g., - * SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, - * y and z. Or SetVariable("classify_bln_numeric_mode", "1"); - * to set numeric-only mode. SetVariable may be used before - * Init, but settings will revert to defaults on - * End().
- *
- * Note: Must be called after Init(). Only works for non-init - * variables (init variables should be passed to Init()). - * - * - * @param handle the TesseractAPI instance - * @param name name of the input - * @param value variable value - * @return 1 on success - */ - public static native int TessBaseAPISetVariable(TessBaseAPI handle, String name, String value); - - /** - * Get the value of an internal int parameter. - * - * @param handle the TesseractAPI instance - * @param name name of the input - * @param value pass the int buffer value - * @return 1 on success - */ - public static native int TessBaseAPIGetIntVariable(TessBaseAPI handle, String name, IntBuffer value); - - /** - * Get the value of an internal bool parameter. - * - * @param handle the TesseractAPI instance - * @param name pass the name of the variable - * @param value pass the int buffer value - * @return 1 on success - */ - public static native int TessBaseAPIGetBoolVariable(TessBaseAPI handle, String name, IntBuffer value); - - /** - * Get the value of an internal double parameter. - * - * @param handle the TesseractAPI instance - * @param name pass the name of the variable - * @param value pass the double buffer value - * @return 1 on success - */ - public static native int TessBaseAPIGetDoubleVariable(TessBaseAPI handle, String name, DoubleBuffer value); - - /** - * Get the value of an internal string parameter. - * - * @param handle the TesseractAPI instance - * @param name pass the name of the variable - * @return the string value - */ - public static native String TessBaseAPIGetStringVariable(TessBaseAPI handle, String name); - - /** - * Print Tesseract parameters to the given file.
- *
- * Note: Must not be the first method called after instance create. - * - * @param handle the TesseractAPI instance - * @param filename name of the file where the variables will be persisted - */ - public static native void TessBaseAPIPrintVariablesToFile(TessBaseAPI handle, String filename); - - /** - * Instances are now mostly thread-safe and totally independent, but some - * global parameters remain. Basically it is safe to use multiple - * TessBaseAPIs in different threads in parallel, UNLESS you use - * SetVariable on some of the Params in classify and textord. - * If you do, then the effect will be to change it for all your - * instances.
- *
- * Start tesseract. Returns zero on success and -1 on failure. NOTE that the - * only members that may be called before Init are those listed - * above here in the class definition.
- *
- * It is entirely safe (and eventually will be efficient too) to call - * Init multiple times on the same instance to change language, - * or just to reset the classifier. Languages may specify internally that - * they want to be loaded with one or more other languages, so the ~ - * sign is available to override that. E.g., if hin were set to - * load eng by default, then hin+~eng would force - * loading only hin. The number of loaded languages is limited - * only by memory, with the caveat that loading additional languages will - * impact both speed and accuracy, as there is more work to do to decide on - * the applicable language, and there is more chance of hallucinating - * incorrect words. WARNING: On changing languages, all Tesseract parameters - * are reset back to their default values. (Which may vary between - * languages.) If you have a rare need to set a Variable that controls - * initialization for a second call to Init you should - * explicitly call End() and then use SetVariable - * before Init.
- * This is only a very rare use case, since there are very few uses that - * require any parameters to be set before Init.
- *
- * If set_only_non_debug_params is true, only params that do - * not contain "debug" in the name will be set. - * - * @param handle the TesseractAPI instance - * @param datapath The datapath must be the name of the parent - * directory of tessdata and must end in - * /. Any name after the last / will be stripped. - * @param language The language is (usually) an ISO 639-3 - * string or NULL will default to eng. The - * language may be a string of the form [~]<lang>[+[~]<lang>] - * indicating that multiple languages are to be loaded. E.g., - * hin+eng will load Hindi and English. - * @param oem ocr engine mode - * @param configs pointer configuration - * @param configs_size pointer configuration size - * @return 0 on success and -1 on initialization failure - */ - public static native int TessBaseAPIInit1(TessBaseAPI handle, String datapath, String language, int oem, - PointerByReference configs, int configs_size); - - /** - * @param handle the TesseractAPI instance - * @param datapath The datapath must be the name of the parent - * directory of tessdata and must end in - * /. Any name after the last / will be stripped. - * @param language The language is (usually) an ISO 639-3 - * string or NULL will default to eng. The - * language may be a string of the form [~]<lang>[+[~]<lang>] - * indicating that multiple languages are to be loaded. E.g., - * hin+eng will load Hindi and English. - * @param oem ocr engine mode - * @return 0 on success and -1 on initialization failure - */ - public static native int TessBaseAPIInit2(TessBaseAPI handle, String datapath, String language, int oem); - - /** - * @param handle the TesseractAPI instance - * @param datapath The datapath must be the name of the parent - * directory of tessdata and must end in - * /. Any name after the last / will be stripped. - * @param language The language is (usually) an ISO 639-3 - * string or NULL will default to eng. The - * language may be a string of the form [~]<lang>[+[~]<lang>] - * indicating that multiple languages are to be loaded. E.g., - * hin+eng will load Hindi and English. - * @return 0 on success and -1 on initialization failure - */ - public static native int TessBaseAPIInit3(TessBaseAPI handle, String datapath, String language); - - /** - * - * @param handle the TesseractAPI instance - * @param datapath The datapath must be the name of the parent - * directory of tessdata and must end in - * /. Any name after the last / will be stripped. - * @param language The language is (usually) an ISO 639-3 - * string or NULL will default to eng. The - * language may be a string of the form [~]<lang>[+[~]<lang>] - * indicating that multiple languages are to be loaded. E.g., - * hin+eng will load Hindi and English. - * @param oem ocr engine mode - * @param configs pointer configuration - * @param configs_size pointer configuration size - * @param vars_vec - * @param vars_values - * @param vars_vec_size - * @param set_only_non_debug_params - * @return 0 on success and -1 on initialization failure - */ - public static native int TessBaseAPIInit4(TessBaseAPI handle, String datapath, String language, int oem, PointerByReference configs, int configs_size, PointerByReference vars_vec, PointerByReference vars_values, NativeSize vars_vec_size, int set_only_non_debug_params); - - /** - * Returns the languages string used in the last valid initialization. If - * the last initialization specified "deu+hin" then that will be returned. - * If hin loaded eng automatically as well, then - * that will not be included in this list. To find the languages actually - * loaded, use GetLoadedLanguagesAsVector. The returned string - * should NOT be deleted. - * - * @param handle the TesseractAPI instance - * @return languages as string - */ - public static native String TessBaseAPIGetInitLanguagesAsString(TessBaseAPI handle); - - /** - * Returns the loaded languages in the vector of STRINGs. Includes all - * languages loaded by the last Init, including those loaded as - * dependencies of other loaded languages. - * - * @param handle the TesseractAPI instance - * @return loaded languages as vector - */ - public static native PointerByReference TessBaseAPIGetLoadedLanguagesAsVector(TessBaseAPI handle); - - /** - * Returns the available languages in the vector of STRINGs. - * - * @param handle the TesseractAPI instance - * @return available languages as vector - */ - public static native PointerByReference TessBaseAPIGetAvailableLanguagesAsVector(TessBaseAPI handle); - - /** - * Init only the lang model component of Tesseract. The only functions that - * work after this init are SetVariable and - * IsValidWord. WARNING: temporary! This function will be - * removed from here and placed in a separate API at some future time. - * - * @param handle the TesseractAPI instance - * @param datapath The datapath must be the name of the parent - * directory of tessdata and must end in - * /. Any name after the last / will be stripped. - * @param language The language is (usually) an ISO 639-3 - * string or NULL will default to eng. The language may be a - * string of the form [~]<lang>[+[~]<lang>] indicating that - * multiple languages are to be loaded. E.g., hin+eng will load Hindi and - * English. - * @return api init language mode - */ - public static native int TessBaseAPIInitLangMod(TessBaseAPI handle, String datapath, String language); - - /** - * Init only for page layout analysis. Use only for calls to - * SetImage and AnalysePage. Calls that attempt - * recognition will generate an error. - * - * @param handle the TesseractAPI instance - */ - public static native void TessBaseAPIInitForAnalysePage(TessBaseAPI handle); - - /** - * Read a "config" file containing a set of param, value pairs. Searches the - * standard places: tessdata/configs, - * tessdata/tessconfigs and also accepts a relative or absolute - * path name. Note: only non-init params will be set (init params are set by - * Init()). - * - * - * @param handle the TesseractAPI instance - * @param filename relative or absolute path for the "config" file - * containing a set of param and value pairs - * @param init_only - */ - public static native void TessBaseAPIReadConfigFile(TessBaseAPI handle, String filename, int init_only); - - /** - * Set the current page segmentation mode. Defaults to - * PSM_SINGLE_BLOCK. The mode is stored as an IntParam so it - * can also be modified by ReadConfigFile or - * SetVariable("tessedit_pageseg_mode", mode as string). - * - * @param handle the TesseractAPI instance - * @param mode tesseract page segment mode - */ - public static native void TessBaseAPISetPageSegMode(TessBaseAPI handle, int mode); - - /** - * Return the current page segmentation mode. - * - * @param handle the TesseractAPI instance - * @return page segment mode value - */ - public static native int TessBaseAPIGetPageSegMode(TessBaseAPI handle); - - /** - * Recognize a rectangle from an image and return the result as a string. - * May be called many times for a single Init. Currently has no - * error checking. Greyscale of 8 and color of 24 or 32 bits per pixel may - * be given. Palette color images will not work properly and must be - * converted to 24 bit. Binary images of 1 bit per pixel may also be given - * but they must be byte packed with the MSB of the first byte being the - * first pixel, and a 1 represents WHITE. For binary images set - * bytes_per_pixel=0. The recognized text is returned as a char* which is - * coded as UTF8 and must be freed with the delete [] operator.
- *
- * Note that TesseractRect is the simplified convenience - * interface. For advanced uses, use SetImage, (optionally) - * SetRectangle, Recognize, and one or more of the - * Get*Text functions below. - * - * @param handle the TesseractAPI instance - * @param imagedata image byte buffer - * @param bytes_per_pixel bytes per pixel - * @param bytes_per_line bytes per line - * @param left image left - * @param top image top - * @param width image width - * @param height image height - * @return the pointer to recognized text - */ - public static native Pointer TessBaseAPIRect(TessBaseAPI handle, ByteBuffer imagedata, - int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height); - - /** - * Call between pages or documents etc to free up memory and forget adaptive - * data. - * - * @param handle the TesseractAPI instance - */ - public static native void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI handle); - - /** - * Provide an image for Tesseract to recognize. Format is as - * TesseractRect above. Does not copy the image buffer, or take - * ownership. The source image may be destroyed after Recognize is called, - * either explicitly or implicitly via one of the Get*Text - * functions. SetImage clears all recognition results, and sets - * the rectangle to the full image, so it may be followed immediately by a - * GetUTF8Text, and it will automatically perform recognition. - * - * @param handle the TesseractAPI instance - * @param imagedata image byte buffer - * @param width image width - * @param height image height - * @param bytes_per_pixel bytes per pixel - * @param bytes_per_line bytes per line - */ - public static native void TessBaseAPISetImage(TessBaseAPI handle, ByteBuffer imagedata, int width, - int height, int bytes_per_pixel, int bytes_per_line); - - /** - * Provide an image for Tesseract to recognize. As with - * SetImage above, Tesseract doesn't take a copy or ownership - * or pixDestroy the image, so it must persist until after - * Recognize. Pix vs raw, which to use? Use - * Pix where possible. A future version of Tesseract may choose - * to use Pix as its internal representation and discard - * IMAGE altogether. Because of that, an implementation that - * sources and targets Pix may end up with less copies than an - * implementation that does not. - * - * @param handle the TesseractAPI instance - * @param pix - */ - public static native void TessBaseAPISetImage2(TessBaseAPI handle, Pix pix); - - /** - * Set the resolution of the source image in pixels per inch so font size - * information can be calculated in results. Call this after - * SetImage(). - * - * @param handle the TesseractAPI instance - * @param ppi source resolution value - */ - public static native void TessBaseAPISetSourceResolution(TessBaseAPI handle, int ppi); - - /** - * Restrict recognition to a sub-rectangle of the image. Call after - * SetImage. Each SetRectangle clears the - * recognition results so multiple rectangles can be recognized with the - * same image. - * - * @param handle the TesseractAPI instance - * @param left value - * @param top value - * @param width value - * @param height value - */ - public static native void TessBaseAPISetRectangle(TessBaseAPI handle, int left, int top, int width, - int height); - - /** - * ONLY available after SetImage if you have Leptonica - * installed. Get a copy of the internal thresholded image from Tesseract. - * - * @param handle the TesseractAPI instance - * @return internal thresholded image - */ - public static native Pix TessBaseAPIGetThresholdedImage(TessBaseAPI handle); - - /** - * Get the result of page layout analysis as a Leptonica-style - * Boxa, Pixa pair, in reading order. Can be - * called before or after Recognize. - * - * @param handle the TesseractAPI instance - * @param pixa array of Pix - * @return array of Box - */ - public static native Boxa TessBaseAPIGetRegions(TessBaseAPI handle, PointerByReference pixa); - - /** - * Get the textlines as a Leptonica-style Boxa, - * Pixa pair, in reading order. Can be called before or after - * Recognize. If blockids is not NULL, the - * block-id of each line is also returned as an array of one element per - * line. delete [] after use. If paraids is not - * NULL, the paragraph-id of each line within its block is also - * returned as an array of one element per line. delete [] after use.
- * Helper method to extract from the thresholded image (most common usage). - * - * @param handle the TesseractAPI instance - * @param pixa array of Pix - * @param blockids - * @return array of Box - */ - public static native Boxa TessBaseAPIGetTextlines(TessBaseAPI handle, PointerByReference pixa, PointerByReference blockids); - - /** - * Get the textlines as a Leptonica-style Boxa, - * Pixa pair, in reading order. Can be called before or after - * Recognize. If blockids is not NULL, the - * block-id of each line is also returned as an array of one element per - * line. delete [] after use. If paraids is not - * NULL, the paragraph-id of each line within its block is also - * returned as an array of one element per line. delete [] after use. - * - * @param handle the TesseractAPI instance - * @param raw_image - * @param raw_padding - * @param pixa array of Pix - * @param blockids - * @param paraids - * @return array of Box - */ - public static native Boxa TessBaseAPIGetTextlines1(TessBaseAPI handle, int raw_image, int raw_padding, PointerByReference pixa, PointerByReference blockids, PointerByReference paraids); - - /** - * Get textlines and strips of image regions as a Leptonica-style - * Boxa, Pixa pair, in reading order. Enables - * downstream handling of non-rectangular regions. Can be called before or - * after Recognize. If blockids is not NULL, the block-id of - * each line is also returned as an array of one element per line. delete [] - * after use. - * - * @param handle the TesseractAPI instance - * @param pixa array of Pix - * @param blockids - * @return array of Box - */ - public static native Boxa TessBaseAPIGetStrips(TessBaseAPI handle, PointerByReference pixa, PointerByReference blockids); - - /** - * Get the words as a Leptonica-style Boxa, Pixa - * pair, in reading order. Can be called before or after - * Recognize. - * - * @param handle the TesseractAPI instance - * @param pixa array of Pix - * @return array of Box - */ - public static native Boxa TessBaseAPIGetWords(TessBaseAPI handle, PointerByReference pixa); - - /** - * Gets the individual connected (text) components (created after pages - * segmentation step, but before recognition) as a Leptonica-style - * Boxa, Pixa pair, in reading order. Can be - * called before or after Recognize. - * - * @param handle the TesseractAPI instance - * @param cc array of Pix - * @return array of Box - */ - public static native Boxa TessBaseAPIGetConnectedComponents(TessBaseAPI handle, PointerByReference cc); - - /** - * Get the given level kind of components (block, textline, word etc.) as a - * Leptonica-style Boxa, Pixa pair, in reading - * order. Can be called before or after Recognize. If blockids - * is not NULL, the block-id of each component is also returned - * as an array of one element per component. delete [] after use. If - * text_only is true, then only text components are returned. - * Helper function to get binary images with no padding (most common usage). - * - * @param handle the TesseractAPI instance - * @param level PageIteratorLevel - * @param text_only - * @param pixa array of Pix - * @param blockids - * @return array of Box - */ - public static native Boxa TessBaseAPIGetComponentImages(TessBaseAPI handle, int level, int text_only, PointerByReference pixa, PointerByReference blockids); - - /** - * Get the given level kind of components (block, textline, word etc.) as a - * Leptonica-style Boxa, Pixa pair, in reading - * order. Can be called before or after Recognize. If blockids - * is not NULL, the block-id of each component is also returned - * as an array of one element per component. delete [] after use. If - * paraids is not NULL, the paragraph-id of each - * component with its block is also returned as an array of one element per - * component. delete [] after use. If raw_image is true, then - * portions of the original image are extracted instead of the thresholded - * image and padded with raw_padding. If text_only is true, - * then only text components are returned. - * - * @param handle the TesseractAPI instance - * @param level PageIteratorLevel - * @param text_only - * @param raw_image - * @param raw_padding - * @param pixa array of Pix - * @param blockids - * @param paraids - * @return - */ - public static native Boxa TessBaseAPIGetComponentImages1(TessBaseAPI handle, int level, int text_only, int raw_image, int raw_padding, PointerByReference pixa, PointerByReference blockids, PointerByReference paraids); - - /** - * @param handle the TesseractAPI instance - * @return Scale factor from original image. - */ - public static native int TessBaseAPIGetThresholdedImageScaleFactor(TessBaseAPI handle); - - /** - * Dump the internal binary image to a PGM file. - * - * @param handle the TesseractAPI instance - * @param filename pgm file name - */ - public static native void TessBaseAPIDumpPGM(TessBaseAPI handle, String filename); - - /** - * Runs page layout analysis in the mode set by SetPageSegMode. - * May optionally be called prior to Recognize to get access to - * just the page layout results. Returns an iterator to the results. Returns - * NULL on error. The returned iterator must be deleted after - * use. WARNING! This class points to data held within the - * TessBaseAPI class, and therefore can only be used while the - * TessBaseAPI class still exists and has not been subjected to - * a call of Init, SetImage, - * Recognize, Clear, End, DetectOS, - * or anything else that changes the internal PAGE_RES. - * - * @param handle the TesseractAPI instance - * @return returns an iterator to the results. Returns NULL on error. The - * returned iterator must be deleted after use. - */ - public static native TessPageIterator TessBaseAPIAnalyseLayout(TessBaseAPI handle); - - /** - * Recognize the image from SetAndThresholdImage, generating - * Tesseract internal structures. Returns 0 on success. Optional. The - * Get*Text functions below will call Recognize if - * needed. After Recognize, the output is kept internally until - * the next SetImage. - * - * @param handle the TesseractAPI instance - * @param monitor the result as Tesseract internal structures - * @return 0 on success - */ - public static native int TessBaseAPIRecognize(TessBaseAPI handle, ETEXT_DESC monitor); - - /** - * Variant on Recognize used for testing chopper. - * - * @param handle the TesseractAPI instance - * @param monitor the result as Tesseract internal structures - * @return 0 on success - */ - public static native int TessBaseAPIRecognizeForChopTest(TessBaseAPI handle, ETEXT_DESC monitor); - - /** - * Get a reading-order iterator to the results of LayoutAnalysis and/or - * Recognize. The returned iterator must be deleted after use. - * WARNING! This class points to data held within the - * TessBaseAPI class, and therefore can only be used while the - * TessBaseAPI class still exists and has not been subjected to - * a call of Init, SetImage, - * Recognize, Clear, End, DetectOS, - * or anything else that changes the internal PAGE_RES. - * - * @param handle the TesseractAPI instance - * @return the result iterator - */ - public static native TessResultIterator TessBaseAPIGetIterator(TessBaseAPI handle); - - /** - * Get a mutable iterator to the results of LayoutAnalysis and/or - * Recognize. The returned iterator must be deleted after use. - * WARNING! This class points to data held within the - * TessBaseAPI class, and therefore can only be used while the - * TessBaseAPI class still exists and has not been subjected to - * a call of Init, SetImage, - * Recognize, Clear, End, DetectOS, - * or anything else that changes the internal PAGE_RES. - * - * @param handle the TesseractAPI instance - * @return the mutable iterator - */ - public static native TessMutableIterator TessBaseAPIGetMutableIterator(TessBaseAPI handle); - - /** - * Recognizes all the pages in the named file, as a multi-page tiff or list - * of filenames, or single image, and gets the appropriate kind of text - * according to parameters: tessedit_create_boxfile, - * tessedit_make_boxes_from_boxes, - * tessedit_write_unlv, tessedit_create_hocr. - * Calls ProcessPage on each page in the input file, which may be a - * multi-page tiff, single-page other file format, or a plain text list of - * images to read. If tessedit_page_number is non-negative, processing - * begins at that page of a multi-page tiff file, or filelist. The text is - * returned in text_out. Returns false on error. If non-zero - * timeout_millisec terminates processing after the timeout on a single - * page. If non-NULL and non-empty, and some page fails for some reason, the - * page is reprocessed with the retry_config config file. Useful for - * interactively debugging a bad page. - * - * @param handle the TesseractAPI instance - * @param filename multi-page tiff or list of filenames - * @param retry_config retry config values - * @param timeout_millisec timeout value - * @param renderer result renderer - * @return the status - */ - public static native int TessBaseAPIProcessPages(TessBaseAPI handle, String filename, String retry_config, int timeout_millisec, TessResultRenderer renderer); - - public static native int TessBaseAPIProcessPage(TessBaseAPI handle, Pix pix, int page_index, String filename, String retry_config, int timeout_millisec, TessResultRenderer renderer); - - /** - * The recognized text is returned as a char* which is coded as UTF-8 and - * must be freed with the delete [] operator. - * - * @param handle the TesseractAPI instance - * @return the pointer to output text - */ - public static native Pointer TessBaseAPIGetUTF8Text(TessBaseAPI handle); - - /** - * Make a HTML-formatted string with hOCR markup from the internal data - * structures. page_number is 0-based but will appear in the output as - * 1-based. - * - * @param handle the TesseractAPI instance - * @param page_number page number - * @return the pointer to hOCR text - */ - public static native Pointer TessBaseAPIGetHOCRText(TessBaseAPI handle, int page_number); - - /** - * The recognized text is returned as a char* which is coded as a UTF8 box - * file and must be freed with the delete [] operator. page_number is a - * 0-base page index that will appear in the box file. - * - * @param handle the TesseractAPI instance - * @param page_number number of the page - * @return the pointer to box text - */ - public static native Pointer TessBaseAPIGetBoxText(TessBaseAPI handle, int page_number); - - /** - * The recognized text is returned as a char* which is coded as UNLV format - * Latin-1 with specific reject and suspect codes and must be freed with the - * delete [] operator. - * - * @param handle the TesseractAPI instance - * @return the pointer to UNLV text - */ - public static native Pointer TessBaseAPIGetUNLVText(TessBaseAPI handle); - - /** - * Returns the average word confidence for Tesseract page result. - * - * @param handle the TesseractAPI instance - * @return the (average) confidence value between 0 and 100. - */ - public static native int TessBaseAPIMeanTextConf(TessBaseAPI handle); - - /** - * Returns an array of all word confidences, terminated by -1. The calling - * function must delete [] after use. The number of confidences should - * correspond to the number of space-delimited words in - * GetUTF8Text. - * - * @param handle the TesseractAPI instance - * @return all word confidences (between 0 and 100) in an array, terminated - * by -1 - */ - public static native IntByReference TessBaseAPIAllWordConfidences(TessBaseAPI handle); - - /** - * Applies the given word to the adaptive classifier if possible. The word - * must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can tell the - * boundaries of the graphemes. Assumes that - * SetImage/SetRectangle have been used to set the - * image to the given word. The mode arg should be - * PSM_SINGLE_WORD or PSM_CIRCLE_WORD, as that - * will be used to control layout analysis. The currently set PageSegMode is - * preserved. - * - * @param handle the TesseractAPI instance - * @param mode tesseract page segment mode - * @param wordstr The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , - * so it can tell the boundaries of the graphemes. - * @return false if adaption was not possible for some reason. - */ - public static native int TessBaseAPIAdaptToWordStr(TessBaseAPI handle, int mode, String wordstr); - - /** - * Free up recognition results and any stored image data, without actually - * freeing any recognition data that would be time-consuming to reload. - * Afterwards, you must call SetImage or - * TesseractRect before doing any Recognize or - * Get* operation. - * - * @param handle the TesseractAPI instance - */ - public static native void TessBaseAPIClear(TessBaseAPI handle); - - /** - * Close down tesseract and free up all memory. End() is - * equivalent to destructing and reconstructing your TessBaseAPI. Once - * End() has been used, none of the other API functions may be - * used other than Init and anything declared above it in the - * class definition. - * - * @param handle the TesseractAPI instance - */ - public static native void TessBaseAPIEnd(TessBaseAPI handle); - - /** - * Check whether a word is valid according to Tesseract's language model. - * - * @param handle the TesseractAPI instance - * @param word word value - * @return 0 if the word is invalid, non-zero if valid - */ - public static native int TessBaseAPIIsValidWord(TessBaseAPI handle, String word); - - /** - * Gets text direction. - * - * @param handle the TesseractAPI instance - * @param out_offset offset - * @param out_slope slope - * @return TRUE if text direction is valid - */ - public static native int TessBaseAPIGetTextDirection(TessBaseAPI handle, IntBuffer out_offset, - FloatBuffer out_slope); - - /** - * Clear any library-level memory caches. There are a variety of - * expensive-to-load constant data structures (mostly language dictionaries) - * that are cached globally -- surviving the Init() and - * End() of individual TessBaseAPI's. This function allows the - * clearing of these caches. - * - * @param handle the TesseractAPI instance - */ - public static native void TessBaseAPIClearPersistentCache(TessBaseAPI handle); - - /** - * Detect the orientation of the input image and apparent script (alphabet). - * orient_deg is the detected clockwise rotation of the input image in - * degrees (0, 90, 180, 270); orient_conf is the confidence (15.0 is - * reasonably confident); script_name is an ASCII string, the name of the - * script, e.g. "Latin"; script_conf is confidence level in the script. - * - * @return TRUE on success and writes values to each parameter as an output - */ - public static native int TessBaseAPIDetectOrientationScript(TessBaseAPI handle, IntBuffer orient_deg, FloatBuffer orient_conf, PointerByReference script_name, FloatBuffer script_conf); - - /** - * Gets the string of the specified unichar. - * - * @param handle the TesseractAPI instance - * @param unichar_id the unichar id - * @return the string form of the specified unichar. - */ - public static native String TessBaseAPIGetUnichar(TessBaseAPI handle, int unichar_id); - - /** - * Deletes the specified PageIterator instance. - * - * @param handle the TessPageIterator instance - */ - public static native void TessPageIteratorDelete(TessPageIterator handle); - - /** - * Creates a copy of the specified PageIterator instance. - * - * @param handle the TessPageIterator instance - * @return page iterator copy - */ - public static native TessPageIterator TessPageIteratorCopy(TessPageIterator handle); - - /** - * Resets the iterator to point to the start of the page. - * - * @param handle the TessPageIterator instance - */ - public static native void TessPageIteratorBegin(TessPageIterator handle); - - /** - * Moves to the start of the next object at the given level in the page - * hierarchy, and returns false if the end of the page was reached. NOTE - * (CHANGED!) that ALL PageIteratorLevel level values will visit each - * non-text block at least once.
- * Think of non text blocks as containing a single para, with at least one - * line, with a single imaginary word, containing a single symbol. The - * bounding boxes mark out any polygonal nature of the block, and - * PTIsTextType(BLockType()) is false for non-text blocks.
- * Calls to Next with different levels may be freely intermixed. This - * function iterates words in right-to-left scripts correctly, if the - * appropriate language has been loaded into Tesseract. - * - * @param handle the TessPageIterator instance - * @param level tesseract page level - * @return next iterator object - */ - public static native int TessPageIteratorNext(TessPageIterator handle, int level); - - /** - * Returns TRUE if the iterator is at the start of an object at the given - * level. Possible uses include determining if a call to Next(RIL_WORD) - * moved to the start of a RIL_PARA. - * - * @param handle the TessPageIterator instance - * @param level tesseract page level - * @return 1 if true - */ - public static native int TessPageIteratorIsAtBeginningOf(TessPageIterator handle, int level); - - /** - * Returns whether the iterator is positioned at the last element in a given - * level. (e.g. the last word in a line, the last line in a block). - * - * @param handle the TessPageIterator instance - * @param level tesseract page level - * @param element page iterator level - * @return 1 if true - */ - public static native int TessPageIteratorIsAtFinalElement(TessPageIterator handle, int level, int element); - - /** - * Returns the bounding rectangle of the current object at the given level - * in coordinates of the original image. - * - * @param handle the TessPageIterator instance - * @param level tesseract page level - * @param left int buffer position - * @param top int buffer position - * @param right int buffer position - * @param bottom int buffer position - * @return FALSE if there is no such object at the current position - */ - public static native int TessPageIteratorBoundingBox(TessPageIterator handle, int level, IntBuffer left, - IntBuffer top, IntBuffer right, IntBuffer bottom); - - /** - * Returns the type of the current block. - * - * @param handle the TessPageIterator instance - * @return TessPolyBlockType value - */ - public static native int TessPageIteratorBlockType(TessPageIterator handle); - - /** - * Returns a binary image of the current object at the given level. The - * position and size match the return from BoundingBoxInternal, and so this - * could be upscaled with respect to the original input image. Use - * pixDestroy to delete the image after use. The following - * methods are used to generate the images: RIL_BLOCK: mask the - * page image with the block polygon. RIL_TEXTLINE: Clip the - * rectangle of the line box from the page image. TODO(rays) fix this to - * generate and use a line polygon. RIL_WORD: Clip the - * rectangle of the word box from the page image. RIL_SYMBOL: - * Render the symbol outline to an image for cblobs (prior to recognition) - * or the bounding box otherwise. A reconstruction of the original image - * (using xor to check for double representation) should be reasonably - * accurate, apart from removed noise, at the block level. Below the block - * level, the reconstruction will be missing images and line separators. At - * the symbol level, kerned characters will be invade the bounding box if - * rendered after recognition, making an xor reconstruction inaccurate, but - * an or construction better. Before recognition, symbol-level - * reconstruction should be good, even with xor, since the images come from - * the connected components. - * - * @param handle the TessPageIterator instance - * @param level PageIteratorLevel - * @return - */ - public static native Pix TessPageIteratorGetBinaryImage(TessPageIterator handle, int level); - - /** - * Returns an image of the current object at the given level in greyscale if - * available in the input. To guarantee a binary image use BinaryImage. NOTE - * that in order to give the best possible image, the bounds are expanded - * slightly over the binary connected component, by the supplied padding, so - * the top-left position of the returned image is returned in (left,top). - * These will most likely not match the coordinates returned by BoundingBox. - * If you do not supply an original image, you will get a binary one. Use - * pixDestroy to delete the image after use. - * - * @param handle the TessPageIterator instance - * @param level PageIteratorLevel - * @param padding - * @param original_image - * @param left - * @param top - * @return - */ - public static native Pix TessPageIteratorGetImage(TessPageIterator handle, int level, int padding, Pix original_image, IntBuffer left, IntBuffer top); - - /** - * Returns the baseline of the current object at the given level. The - * baseline is the line that passes through (x1, y1) and (x2, y2).
- * WARNING: with vertical text, baselines may be vertical! - * - * @param handle the TessPageIterator instance - * @param level PageIteratorLevel - * @param x1 int buffer position - * @param y1 int buffer position - * @param x2 int buffer position - * @param y2 int buffer position - * @return TRUE if the baseline is valid - */ - public static native int TessPageIteratorBaseline(TessPageIterator handle, int level, IntBuffer x1, - IntBuffer y1, IntBuffer x2, IntBuffer y2); - - /** - * Returns the orientation. - * - * @param handle the TessPageIterator instance - * @param orientation orientation value - * @param writing_direction writing direction value - * @param textline_order text line order - * @param deskew_angle deskew angle - */ - public static native void TessPageIteratorOrientation(TessPageIterator handle, IntBuffer orientation, - IntBuffer writing_direction, IntBuffer textline_order, FloatBuffer deskew_angle); - - /** - * Gets paragraph information. - * - * @param handle the TessPageIterator instance - * @param justification justification type - * @param is_list_item list item - * @param is_crown very first or continuation - * @param first_line_indent first line indentation - */ - public static native void TessPageIteratorParagraphInfo(TessPageIterator handle, IntBuffer justification, - IntBuffer is_list_item, IntBuffer is_crown, IntBuffer first_line_indent); - - /** - * Deletes the specified ResultIterator handle. - * - * @param handle the TessResultIterator instance - */ - public static native void TessResultIteratorDelete(TessResultIterator handle); - - /** - * Creates a copy of the specified ResultIterator instance. - * - * @param handle the TessResultIterator instance - * @return the copy object - */ - public static native TessResultIterator TessResultIteratorCopy(TessResultIterator handle); - - /** - * Gets the PageIterator of the specified ResultIterator instance. - * - * @param handle the TessResultIterator instance - * @return the page iterator - */ - public static native TessPageIterator TessResultIteratorGetPageIterator(TessResultIterator handle); - - /** - * Gets the PageIterator of the specified ResultIterator instance. - * - * @param handle the TessResultIterator instance - * @return the page iterator constant - */ - public static native TessPageIterator TessResultIteratorGetPageIteratorConst( - TessResultIterator handle); - - public static native int TessResultIteratorNext(TessResultIterator handle, int level); - - /** - * Returns the null terminated UTF-8 encoded text string for the current - * object at the given level. Use delete [] to free after use. - * - * @param handle the TessResultIterator instance - * @param level tesseract page level - * @return the pointer to recognized text - */ - public static native Pointer TessResultIteratorGetUTF8Text(TessResultIterator handle, int level); - - /** - * Returns the mean confidence of the current object at the given level. The - * number should be interpreted as a percent probability (0.0f-100.0f). - * - * @param handle the TessResultIterator instance - * @param level tesseract page level - * @return confidence value - */ - public static native float TessResultIteratorConfidence(TessResultIterator handle, int level); - - public static native String TessResultIteratorWordRecognitionLanguage(TessResultIterator handle); - - /** - * Returns the font attributes of the current word. If iterating at a higher - * level object than words, e.g., textlines, then this will return the - * attributes of the first word in that textline. The actual return value is - * a string representing a font name. It points to an internal table and - * SHOULD NOT BE DELETED. Lifespan is the same as the iterator itself, ie - * rendered invalid by various members of TessBaseAPI, including - * Init, SetImage, End or deleting - * the TessBaseAPI. Pointsize is returned in printers points (1/72 inch). - * - * @param handle the TessResultIterator instance - * @param is_bold font attribute - * @param is_italic font attribute - * @param is_underlined font attribute - * @param is_monospace font attribute - * @param is_serif font attribute - * @param is_smallcaps font attribute - * @param pointsize font attribute - * @param font_id font attribute - * @return font name - */ - public static native String TessResultIteratorWordFontAttributes(TessResultIterator handle, - IntBuffer is_bold, IntBuffer is_italic, IntBuffer is_underlined, IntBuffer is_monospace, - IntBuffer is_serif, IntBuffer is_smallcaps, IntBuffer pointsize, IntBuffer font_id); - - /** - * Returns TRUE if the current word was found in a dictionary. - * - * @param handle the TessResultIterator instance - * @return 1 if word is from dictionary - */ - public static native int TessResultIteratorWordIsFromDictionary(TessResultIterator handle); - - /** - * Returns TRUE if the current word is numeric. - * - * @param handle the TessResultIterator instance - * @return 1 if word is numeric - */ - public static native int TessResultIteratorWordIsNumeric(TessResultIterator handle); - - /** - * Returns TRUE if the current symbol is a superscript. If iterating at a - * higher level object than symbols, e.g., words, then this will return the - * attributes of the first symbol in that word. - * - * @param handle the TessResultIterator instance - * @return 1 if symbol is superscript - */ - public static native int TessResultIteratorSymbolIsSuperscript(TessResultIterator handle); - - /** - * Returns TRUE if the current symbol is a subscript. If iterating at a - * higher level object than symbols, e.g., words, then this will return the - * attributes of the first symbol in that word. - * - * @param handle the TessResultIterator instance - * @return 1 if symbol is subscript - */ - public static native int TessResultIteratorSymbolIsSubscript(TessResultIterator handle); - - /** - * Returns TRUE if the current symbol is a dropcap. If iterating at a higher - * level object than symbols, e.g., words, then this will return the - * attributes of the first symbol in that word. - * - * @param handle the TessResultIterator instance - * @return 1 if symbol is dropcap - */ - public static native int TessResultIteratorSymbolIsDropcap(TessResultIterator handle); - - /* Choice iterator */ - public static native TessChoiceIterator TessResultIteratorGetChoiceIterator(TessResultIterator handle); - - public static native void TessChoiceIteratorDelete(TessChoiceIterator handle); - - public static native int TessChoiceIteratorNext(TessChoiceIterator handle); - - public static native String TessChoiceIteratorGetUTF8Text(TessChoiceIterator handle); - - public static native float TessChoiceIteratorConfidence(TessChoiceIterator handle); -} diff --git a/Tess4J/src/net/sourceforge/tess4j/Tesseract.java b/Tess4J/src/net/sourceforge/tess4j/Tesseract.java deleted file mode 100644 index 2410e27..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/Tesseract.java +++ /dev/null @@ -1,682 +0,0 @@ -/** - * Copyright @ 2012 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j; - -import com.sun.jna.Pointer; -import com.sun.jna.StringArray; -import com.sun.jna.ptr.PointerByReference; -import java.awt.Rectangle; -import java.awt.image.*; -import java.io.*; -import java.nio.ByteBuffer; -import java.nio.IntBuffer; -import java.util.*; -import javax.imageio.IIOImage; -import net.sourceforge.lept4j.Box; -import net.sourceforge.lept4j.Boxa; -import static net.sourceforge.lept4j.ILeptonica.L_CLONE; -import net.sourceforge.lept4j.Leptonica; -import static net.sourceforge.tess4j.ITessAPI.TRUE; - -import net.sourceforge.tess4j.ITessAPI.TessBaseAPI; -import net.sourceforge.tess4j.ITessAPI.TessOcrEngineMode; -import net.sourceforge.tess4j.ITessAPI.TessPageIterator; -import net.sourceforge.tess4j.ITessAPI.TessResultIterator; -import net.sourceforge.tess4j.ITessAPI.TessResultRenderer; - -import net.sourceforge.tess4j.util.ImageIOHelper; -import net.sourceforge.tess4j.util.LoggHelper; -import net.sourceforge.tess4j.util.PdfUtilities; -import org.slf4j.*; - -/** - * An object layer on top of TessAPI, provides character - * recognition support for common image formats, and multi-page TIFF images - * beyond the uncompressed, binary TIFF format supported by Tesseract OCR - * engine. The extended capabilities are provided by the - * Java Advanced Imaging Image I/O Tools.
- *
- * Support for PDF documents is available through Ghost4J, a - * JNA wrapper for GPL Ghostscript, which should be - * installed and included in system path.
- *
- * Any program that uses the library will need to ensure that the required - * libraries (the .jar files for jna, - * jai-imageio, and ghost4j) are in its compile and - * run-time classpath. - */ -public class Tesseract implements ITesseract { - - private static Tesseract instance; - private String language = "eng"; - private String datapath; - private RenderedFormat renderedFormat = RenderedFormat.TEXT; - private int psm = -1; - private int ocrEngineMode = TessOcrEngineMode.OEM_DEFAULT; - private final Properties prop = new Properties(); - private final List configList = new ArrayList(); - - private TessAPI api; - private TessBaseAPI handle; - - private static final org.slf4j.Logger logger = LoggerFactory.getLogger(new LoggHelper().toString()); - - public Tesseract() { - try { - datapath = System.getenv("TESSDATA_PREFIX"); - } catch (Exception e) { - // ignore - } finally { - if (datapath == null) { - datapath = "./"; - } - } - } - - /** - * Returns TessAPI object. - * - * @return api - */ - protected TessAPI getAPI() { - return api; - } - - /** - * Returns API handle. - * - * @return handle - */ - protected TessBaseAPI getHandle() { - return handle; - } - - /** - * Gets an instance of the class library. - * - * @deprecated As of Release 2.0, use default constructor instead. - * @return instance - */ - @Deprecated - public static synchronized Tesseract getInstance() { - if (instance == null) { - instance = new Tesseract(); - } - - return instance; - } - - /** - * Sets path to tessdata. - * - * @param datapath the tessdata path to set - */ - @Override - public void setDatapath(String datapath) { - this.datapath = datapath; - } - - /** - * Sets language for OCR. - * - * @param language the language code, which follows ISO 639-3 standard. - */ - @Override - public void setLanguage(String language) { - this.language = language; - } - - /** - * Sets OCR engine mode. - * - * @param ocrEngineMode the OcrEngineMode to set - */ - @Override - public void setOcrEngineMode(int ocrEngineMode) { - this.ocrEngineMode = ocrEngineMode; - } - - /** - * Sets page segmentation mode. - * - * @param mode the page segmentation mode to set - */ - @Override - public void setPageSegMode(int mode) { - this.psm = mode; - } - - /** - * Enables hocr output. - * - * @param hocr to enable or disable hocr output - */ - public void setHocr(boolean hocr) { - this.renderedFormat = hocr ? RenderedFormat.HOCR : RenderedFormat.TEXT; - prop.setProperty("tessedit_create_hocr", hocr ? "1" : "0"); - } - - /** - * Set the value of Tesseract's internal parameter. - * - * @param key variable name, e.g., tessedit_create_hocr, - * tessedit_char_whitelist, etc. - * @param value value for corresponding variable, e.g., "1", "0", - * "0123456789", etc. - */ - @Override - public void setTessVariable(String key, String value) { - prop.setProperty(key, value); - } - - /** - * Sets configs to be passed to Tesseract's Init method. - * - * @param configs list of config filenames, e.g., "digits", "bazaar", - * "quiet" - */ - @Override - public void setConfigs(List configs) { - configList.clear(); - if (configs != null) { - configList.addAll(configs); - } - } - - /** - * Performs OCR operation. - * - * @param imageFile an image file - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(File imageFile) throws TesseractException { - return doOCR(imageFile, null); - } - - /** - * Performs OCR operation. - * - * @param imageFile an image file - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(File imageFile, Rectangle rect) throws TesseractException { - try { - return doOCR(ImageIOHelper.getIIOImageList(imageFile), imageFile.getPath(), rect); - } catch (Exception e) { - logger.error(e.getMessage(), e); - throw new TesseractException(e); - } - } - - /** - * Performs OCR operation. - * - * @param bi a buffered image - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(BufferedImage bi) throws TesseractException { - return doOCR(bi, null); - } - - /** - * Performs OCR operation. - * - * @param bi a buffered image - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(BufferedImage bi, Rectangle rect) throws TesseractException { - try { - return doOCR(ImageIOHelper.getIIOImageList(bi), rect); - } catch (Exception e) { - logger.error(e.getMessage(), e); - throw new TesseractException(e); - } - } - - /** - * Performs OCR operation. - * - * @param imageList a list of IIOImage objects - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(List imageList, Rectangle rect) throws TesseractException { - return doOCR(imageList, null, rect); - } - - /** - * Performs OCR operation. - * - * @param imageList a list of IIOImage objects - * @param filename input file name. Needed only for training and reading a - * UNLV zone file. - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(List imageList, String filename, Rectangle rect) throws TesseractException { - init(); - setTessVariables(); - - try { - StringBuilder sb = new StringBuilder(); - int pageNum = 0; - - for (IIOImage oimage : imageList) { - pageNum++; - try { - setImage(oimage.getRenderedImage(), rect); - sb.append(getOCRText(filename, pageNum)); - } catch (IOException ioe) { - // skip the problematic image - logger.error(ioe.getMessage(), ioe); - } - } - - if (renderedFormat == RenderedFormat.HOCR) { - sb.insert(0, htmlBeginTag).append(htmlEndTag); - } - - return sb.toString(); - } finally { - dispose(); - } - } - - /** - * Performs OCR operation. Use SetImage, (optionally) - * SetRectangle, and one or more of the Get*Text - * functions. - * - * @param xsize width of image - * @param ysize height of image - * @param buf pixel data - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @param bpp bits per pixel, represents the bit depth of the image, with 1 - * for binary bitmap, 8 for gray, and 24 for color RGB. - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(int xsize, int ysize, ByteBuffer buf, Rectangle rect, int bpp) throws TesseractException { - return doOCR(xsize, ysize, buf, null, rect, bpp); - } - - /** - * Performs OCR operation. Use SetImage, (optionally) - * SetRectangle, and one or more of the Get*Text - * functions. - * - * @param xsize width of image - * @param ysize height of image - * @param buf pixel data - * @param filename input file name. Needed only for training and reading a - * UNLV zone file. - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @param bpp bits per pixel, represents the bit depth of the image, with 1 - * for binary bitmap, 8 for gray, and 24 for color RGB. - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(int xsize, int ysize, ByteBuffer buf, String filename, Rectangle rect, int bpp) throws TesseractException { - init(); - setTessVariables(); - - try { - setImage(xsize, ysize, buf, rect, bpp); - return getOCRText(filename, 1); - } catch (Exception e) { - logger.error(e.getMessage(), e); - throw new TesseractException(e); - } finally { - dispose(); - } - } - - /** - * Initializes Tesseract engine. - */ - protected void init() { - api = TessAPI.INSTANCE; - handle = api.TessBaseAPICreate(); - StringArray sarray = new StringArray(configList.toArray(new String[0])); - PointerByReference configs = new PointerByReference(); - configs.setPointer(sarray); - api.TessBaseAPIInit1(handle, datapath, language, ocrEngineMode, configs, configList.size()); - if (psm > -1) { - api.TessBaseAPISetPageSegMode(handle, psm); - } - } - - /** - * Sets Tesseract's internal parameters. - */ - protected void setTessVariables() { - Enumeration em = prop.propertyNames(); - while (em.hasMoreElements()) { - String key = (String) em.nextElement(); - api.TessBaseAPISetVariable(handle, key, prop.getProperty(key)); - } - } - - /** - * A wrapper for {@link #setImage(int, int, ByteBuffer, Rectangle, int)}. - * - * @param image a rendered image - * @param rect region of interest - * @throws java.io.IOException - */ - protected void setImage(RenderedImage image, Rectangle rect) throws IOException { - setImage(image.getWidth(), image.getHeight(), ImageIOHelper.getImageByteBuffer(image), rect, image - .getColorModel().getPixelSize()); - } - - /** - * Sets image to be processed. - * - * @param xsize width of image - * @param ysize height of image - * @param buf pixel data - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @param bpp bits per pixel, represents the bit depth of the image, with 1 - * for binary bitmap, 8 for gray, and 24 for color RGB. - */ - protected void setImage(int xsize, int ysize, ByteBuffer buf, Rectangle rect, int bpp) { - int bytespp = bpp / 8; - int bytespl = (int) Math.ceil(xsize * bpp / 8.0); - api.TessBaseAPISetImage(handle, buf, xsize, ysize, bytespp, bytespl); - - if (rect != null && !rect.isEmpty()) { - api.TessBaseAPISetRectangle(handle, rect.x, rect.y, rect.width, rect.height); - } - } - - /** - * Gets recognized text. - * - * @param filename input file name. Needed only for reading a UNLV zone - * file. - * @param pageNum page number; needed for hocr paging. - * @return the recognized text - */ - protected String getOCRText(String filename, int pageNum) { - if (filename != null && !filename.isEmpty()) { - api.TessBaseAPISetInputName(handle, filename); - } - - Pointer utf8Text = renderedFormat == RenderedFormat.HOCR ? api.TessBaseAPIGetHOCRText(handle, pageNum - 1) : api.TessBaseAPIGetUTF8Text(handle); - String str = utf8Text.getString(0); - api.TessDeleteText(utf8Text); - return str; - } - - /** - * Creates renderers for given formats. - * - * @param outputbase - * @param formats - * @return - */ - private TessResultRenderer createRenderers(String outputbase, List formats) { - TessResultRenderer renderer = null; - - for (RenderedFormat format : formats) { - switch (format) { - case TEXT: - if (renderer == null) { - renderer = api.TessTextRendererCreate(outputbase); - } else { - api.TessResultRendererInsert(renderer, api.TessTextRendererCreate(outputbase)); - } - break; - case HOCR: - if (renderer == null) { - renderer = api.TessHOcrRendererCreate(outputbase); - } else { - api.TessResultRendererInsert(renderer, api.TessHOcrRendererCreate(outputbase)); - } - break; - case PDF: - String dataPath = api.TessBaseAPIGetDatapath(handle); - if (renderer == null) { - renderer = api.TessPDFRendererCreate(outputbase, dataPath); - } else { - api.TessResultRendererInsert(renderer, api.TessPDFRendererCreate(outputbase, dataPath)); - } - break; - case BOX: - if (renderer == null) { - renderer = api.TessBoxTextRendererCreate(outputbase); - } else { - api.TessResultRendererInsert(renderer, api.TessBoxTextRendererCreate(outputbase)); - } - break; - case UNLV: - if (renderer == null) { - renderer = api.TessUnlvRendererCreate(outputbase); - } else { - api.TessResultRendererInsert(renderer, api.TessUnlvRendererCreate(outputbase)); - } - break; - } - } - - return renderer; - } - - /** - * Creates documents for given renderer. - * - * @param filename input image - * @param outputbase output filename without extension - * @param formats types of renderer - * @throws TesseractException - */ - @Override - public void createDocuments(String filename, String outputbase, List formats) throws TesseractException { - createDocuments(new String[]{filename}, new String[]{outputbase}, formats); - } - - /** - * Creates documents. - * - * @param filenames array of input files - * @param outputbases array of output filenames without extension - * @param formats types of renderer - * @throws TesseractException - */ - @Override - public void createDocuments(String[] filenames, String[] outputbases, List formats) throws TesseractException { - if (filenames.length != outputbases.length) { - throw new RuntimeException("The two arrays must match in length."); - } - - init(); - setTessVariables(); - - try { - for (int i = 0; i < filenames.length; i++) { - File workingTiffFile = null; - try { - String filename = filenames[i]; - - // if PDF, convert to multi-page TIFF - if (filename.toLowerCase().endsWith(".pdf")) { - workingTiffFile = PdfUtilities.convertPdf2Tiff(new File(filename)); - filename = workingTiffFile.getPath(); - } - - TessResultRenderer renderer = createRenderers(outputbases[i], formats); - createDocuments(filename, renderer); - api.TessDeleteResultRenderer(renderer); - } catch (Exception e) { - // skip the problematic image file - logger.error(e.getMessage(), e); - } finally { - if (workingTiffFile != null && workingTiffFile.exists()) { - workingTiffFile.delete(); - } - } - } - } finally { - dispose(); - } - } - - /** - * Creates documents. - * - * @param filename input file - * @param renderer renderer - * @throws TesseractException - */ - private void createDocuments(String filename, TessResultRenderer renderer) throws TesseractException { - api.TessBaseAPISetInputName(handle, filename); //for reading a UNLV zone file - int result = api.TessBaseAPIProcessPages(handle, filename, null, 0, renderer); - - if (result == ITessAPI.FALSE) { - throw new TesseractException("Error during processing page."); - } - } - - /** - * Gets segmented regions at specified page iterator level. - * - * @param bi input image - * @param pageIteratorLevel TessPageIteratorLevel enum - * @return list of Rectangle - * @throws TesseractException - */ - @Override - public List getSegmentedRegions(BufferedImage bi, int pageIteratorLevel) throws TesseractException { - init(); - setTessVariables(); - - try { - List list = new ArrayList(); - setImage(bi, null); - - Boxa boxes = api.TessBaseAPIGetComponentImages(handle, pageIteratorLevel, TRUE, null, null); - Leptonica leptInstance = Leptonica.INSTANCE; - int boxCount = leptInstance.boxaGetCount(boxes); - for (int i = 0; i < boxCount; i++) { - Box box = leptInstance.boxaGetBox(boxes, i, L_CLONE); - if (box == null) { - continue; - } - list.add(new Rectangle(box.x, box.y, box.w, box.h)); - PointerByReference pRef = new PointerByReference(); - pRef.setValue(box.getPointer()); - leptInstance.boxDestroy(pRef); - } - - PointerByReference pRef = new PointerByReference(); - pRef.setValue(boxes.getPointer()); - leptInstance.boxaDestroy(pRef); - - return list; - } catch (IOException ioe) { - // skip the problematic image - logger.error(ioe.getMessage(), ioe); - throw new TesseractException(ioe); - } finally { - dispose(); - } - } - - /** - * Gets recognized words at specified page iterator level. - * - * @param bi input image - * @param pageIteratorLevel TessPageIteratorLevel enum - * @return list of Word - */ - @Override - public List getWords(BufferedImage bi, int pageIteratorLevel) { - this.init(); - this.setTessVariables(); - - List words = new ArrayList(); - - try { - setImage(bi, null); - - api.TessBaseAPIRecognize(handle, null); - TessResultIterator ri = api.TessBaseAPIGetIterator(handle); - TessPageIterator pi = api.TessResultIteratorGetPageIterator(ri); - api.TessPageIteratorBegin(pi); - - do { - Pointer ptr = api.TessResultIteratorGetUTF8Text(ri, pageIteratorLevel); - String text = ptr.getString(0); - api.TessDeleteText(ptr); - float confidence = api.TessResultIteratorConfidence(ri, pageIteratorLevel); - IntBuffer leftB = IntBuffer.allocate(1); - IntBuffer topB = IntBuffer.allocate(1); - IntBuffer rightB = IntBuffer.allocate(1); - IntBuffer bottomB = IntBuffer.allocate(1); - api.TessPageIteratorBoundingBox(pi, pageIteratorLevel, leftB, topB, rightB, bottomB); - int left = leftB.get(); - int top = topB.get(); - int right = rightB.get(); - int bottom = bottomB.get(); - Word word = new Word(text, confidence, new Rectangle(left, top, right - left, bottom - top)); - words.add(word); - } while (api.TessPageIteratorNext(pi, pageIteratorLevel) == TRUE); - - return words; - } catch (Exception e) { - return words; - } finally { - dispose(); - } - } - - /** - * Releases all of the native resources used by this instance. - */ - protected void dispose() { - api.TessBaseAPIDelete(handle); - } -} diff --git a/Tess4J/src/net/sourceforge/tess4j/Tesseract1.java b/Tess4J/src/net/sourceforge/tess4j/Tesseract1.java deleted file mode 100644 index 369ff89..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/Tesseract1.java +++ /dev/null @@ -1,647 +0,0 @@ -/** - * Copyright @ 2012 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j; - -import com.sun.jna.Pointer; -import com.sun.jna.StringArray; -import com.sun.jna.ptr.PointerByReference; -import java.awt.Rectangle; -import java.awt.image.*; -import java.io.*; -import java.nio.ByteBuffer; -import java.nio.IntBuffer; -import java.util.*; -import javax.imageio.IIOImage; -import net.sourceforge.lept4j.Box; -import net.sourceforge.lept4j.Boxa; -import static net.sourceforge.lept4j.ILeptonica.L_CLONE; -import net.sourceforge.lept4j.Leptonica1; -import static net.sourceforge.tess4j.ITessAPI.TRUE; - -import net.sourceforge.tess4j.util.ImageIOHelper; -import net.sourceforge.tess4j.util.LoggHelper; -import net.sourceforge.tess4j.util.PdfUtilities; -import org.slf4j.*; - -/** - * An object layer on top of TessAPI1, provides character - * recognition support for common image formats, and multi-page TIFF images - * beyond the uncompressed, binary TIFF format supported by Tesseract OCR - * engine. The extended capabilities are provided by the - * Java Advanced Imaging Image I/O Tools.
- *
- * Support for PDF documents is available through Ghost4J, a - * JNA wrapper for GPL Ghostscript, which should be - * installed and included in system path.
- *
- * Any program that uses the library will need to ensure that the required - * libraries (the .jar files for jna, - * jai-imageio, and ghost4j) are in its compile and - * run-time classpath. - */ -public class Tesseract1 extends TessAPI1 implements ITesseract { - - private String language = "eng"; - private String datapath; - private RenderedFormat renderedFormat = RenderedFormat.TEXT; - private int psm = -1; - private int ocrEngineMode = TessOcrEngineMode.OEM_DEFAULT; - private final Properties prop = new Properties(); - private final List configList = new ArrayList(); - - private TessBaseAPI handle; - - private static final org.slf4j.Logger logger = LoggerFactory.getLogger(new LoggHelper().toString()); - - public Tesseract1() { - try { - datapath = System.getenv("TESSDATA_PREFIX"); - } catch (Exception e) { - // ignore - } finally { - if (datapath == null) { - datapath = "./"; - } - } - } - - /** - * Returns API handle. - * - * @return handle - */ - protected TessBaseAPI getHandle() { - return handle; - } - - /** - * Sets path to tessdata. - * - * @param datapath the tessdata path to set - */ - @Override - public void setDatapath(String datapath) { - this.datapath = datapath; - } - - /** - * Sets language for OCR. - * - * @param language the language code, which follows ISO 639-3 standard. - */ - @Override - public void setLanguage(String language) { - this.language = language; - } - - /** - * Sets OCR engine mode. - * - * @param ocrEngineMode the OcrEngineMode to set - */ - @Override - public void setOcrEngineMode(int ocrEngineMode) { - this.ocrEngineMode = ocrEngineMode; - } - - /** - * Sets page segmentation mode. - * - * @param mode the page segmentation mode to set - */ - @Override - public void setPageSegMode(int mode) { - this.psm = mode; - } - - /** - * Enables hocr output. - * - * @param hocr to enable or disable hocr output - */ - public void setHocr(boolean hocr) { - this.renderedFormat = hocr ? RenderedFormat.HOCR : RenderedFormat.TEXT; - prop.setProperty("tessedit_create_hocr", hocr ? "1" : "0"); - } - - /** - * Set the value of Tesseract's internal parameter. - * - * @param key variable name, e.g., tessedit_create_hocr, - * tessedit_char_whitelist, etc. - * @param value value for corresponding variable, e.g., "1", "0", - * "0123456789", etc. - */ - @Override - public void setTessVariable(String key, String value) { - prop.setProperty(key, value); - } - - /** - * Sets configs to be passed to Tesseract's Init method. - * - * @param configs list of config filenames, e.g., "digits", "bazaar", - * "quiet" - */ - @Override - public void setConfigs(List configs) { - configList.clear(); - if (configs != null) { - configList.addAll(configs); - } - } - - /** - * Performs OCR operation. - * - * @param imageFile an image file - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(File imageFile) throws TesseractException { - return doOCR(imageFile, null); - } - - /** - * Performs OCR operation. - * - * @param imageFile an image file - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(File imageFile, Rectangle rect) throws TesseractException { - try { - return doOCR(ImageIOHelper.getIIOImageList(imageFile), imageFile.getPath(), rect); - } catch (Exception e) { - logger.error(e.getMessage(), e); - throw new TesseractException(e); - } - } - - /** - * Performs OCR operation. - * - * @param bi a buffered image - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(BufferedImage bi) throws TesseractException { - return doOCR(bi, null); - } - - /** - * Performs OCR operation. - * - * @param bi a buffered image - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(BufferedImage bi, Rectangle rect) throws TesseractException { - try { - return doOCR(ImageIOHelper.getIIOImageList(bi), rect); - } catch (Exception e) { - logger.error(e.getMessage(), e); - throw new TesseractException(e); - } - } - - /** - * Performs OCR operation. - * - * @param imageList a list of IIOImage objects - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(List imageList, Rectangle rect) throws TesseractException { - return doOCR(imageList, null, rect); - } - - /** - * Performs OCR operation. - * - * @param imageList a list of IIOImage objects - * @param filename input file name - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(List imageList, String filename, Rectangle rect) throws TesseractException { - init(); - setTessVariables(); - - try { - StringBuilder sb = new StringBuilder(); - int pageNum = 0; - - for (IIOImage oimage : imageList) { - pageNum++; - try { - setImage(oimage.getRenderedImage(), rect); - sb.append(getOCRText(filename, pageNum)); - } catch (IOException ioe) { - // skip the problematic image - logger.error(ioe.getMessage(), ioe); - } - } - - if (renderedFormat == RenderedFormat.HOCR) { - sb.insert(0, htmlBeginTag).append(htmlEndTag); - } - - return sb.toString(); - } finally { - dispose(); - } - } - - /** - * Performs OCR operation. Use SetImage, (optionally) - * SetRectangle, and one or more of the Get*Text - * functions. - * - * @param xsize width of image - * @param ysize height of image - * @param buf pixel data - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @param bpp bits per pixel, represents the bit depth of the image, with 1 - * for binary bitmap, 8 for gray, and 24 for color RGB. - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(int xsize, int ysize, ByteBuffer buf, Rectangle rect, int bpp) throws TesseractException { - return doOCR(xsize, ysize, buf, null, rect, bpp); - } - - /** - * Performs OCR operation. Use SetImage, (optionally) - * SetRectangle, and one or more of the Get*Text - * functions. - * - * @param xsize width of image - * @param ysize height of image - * @param buf pixel data - * @param filename input file name. Needed only for training and reading a - * UNLV zone file. - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @param bpp bits per pixel, represents the bit depth of the image, with 1 - * for binary bitmap, 8 for gray, and 24 for color RGB. - * @return the recognized text - * @throws TesseractException - */ - @Override - public String doOCR(int xsize, int ysize, ByteBuffer buf, String filename, Rectangle rect, int bpp) throws TesseractException { - init(); - setTessVariables(); - - try { - setImage(xsize, ysize, buf, rect, bpp); - return getOCRText(filename, 1); - } catch (Exception e) { - logger.error(e.getMessage(), e); - throw new TesseractException(e); - } finally { - dispose(); - } - } - - /** - * Initializes Tesseract engine. - */ - protected void init() { - handle = TessBaseAPICreate(); - StringArray sarray = new StringArray(configList.toArray(new String[0])); - PointerByReference configs = new PointerByReference(); - configs.setPointer(sarray); - TessBaseAPIInit1(handle, datapath, language, ocrEngineMode, configs, configList.size()); - if (psm > -1) { - TessBaseAPISetPageSegMode(handle, psm); - } - } - - /** - * Sets Tesseract's internal parameters. - */ - protected void setTessVariables() { - Enumeration em = prop.propertyNames(); - while (em.hasMoreElements()) { - String key = (String) em.nextElement(); - TessBaseAPISetVariable(handle, key, prop.getProperty(key)); - } - } - - /** - * A wrapper for {@link #setImage(int, int, ByteBuffer, Rectangle, int)}. - * - * @param image a rendered image - * @param rect region of interest - * @throws java.io.IOException - */ - protected void setImage(RenderedImage image, Rectangle rect) throws IOException { - setImage(image.getWidth(), image.getHeight(), ImageIOHelper.getImageByteBuffer(image), rect, image - .getColorModel().getPixelSize()); - } - - /** - * Sets image to be processed. - * - * @param xsize width of image - * @param ysize height of image - * @param buf pixel data - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or null indicates - * the whole image. - * @param bpp bits per pixel, represents the bit depth of the image, with 1 - * for binary bitmap, 8 for gray, and 24 for color RGB. - */ - protected void setImage(int xsize, int ysize, ByteBuffer buf, Rectangle rect, int bpp) { - int bytespp = bpp / 8; - int bytespl = (int) Math.ceil(xsize * bpp / 8.0); - TessBaseAPISetImage(handle, buf, xsize, ysize, bytespp, bytespl); - - if (rect != null && !rect.isEmpty()) { - TessBaseAPISetRectangle(handle, rect.x, rect.y, rect.width, rect.height); - } - } - - /** - * Gets recognized text. - * - * @param filename input file name. Needed only for reading a UNLV zone - * file. - * @param pageNum page number; needed for hocr paging. - * @return the recognized text - */ - protected String getOCRText(String filename, int pageNum) { - if (filename != null && !filename.isEmpty()) { - TessBaseAPISetInputName(handle, filename); - } - - Pointer utf8Text = renderedFormat == RenderedFormat.HOCR ? TessBaseAPIGetHOCRText(handle, pageNum - 1) : TessBaseAPIGetUTF8Text(handle); - String str = utf8Text.getString(0); - TessDeleteText(utf8Text); - return str; - } - - /** - * Creates renderers for given formats. - * - * @param outputbase - * @param formats - * @return - */ - private TessResultRenderer createRenderers(String outputbase, List formats) { - TessResultRenderer renderer = null; - - for (RenderedFormat format : formats) { - switch (format) { - case TEXT: - if (renderer == null) { - renderer = TessTextRendererCreate(outputbase); - } else { - TessResultRendererInsert(renderer, TessTextRendererCreate(outputbase)); - } - break; - case HOCR: - if (renderer == null) { - renderer = TessHOcrRendererCreate(outputbase); - } else { - TessResultRendererInsert(renderer, TessHOcrRendererCreate(outputbase)); - } - break; - case PDF: - String dataPath = TessBaseAPIGetDatapath(handle); - if (renderer == null) { - renderer = TessPDFRendererCreate(outputbase, dataPath); - } else { - TessResultRendererInsert(renderer, TessPDFRendererCreate(outputbase, dataPath)); - } - break; - case BOX: - if (renderer == null) { - renderer = TessBoxTextRendererCreate(outputbase); - } else { - TessResultRendererInsert(renderer, TessBoxTextRendererCreate(outputbase)); - } - break; - case UNLV: - if (renderer == null) { - renderer = TessUnlvRendererCreate(outputbase); - } else { - TessResultRendererInsert(renderer, TessUnlvRendererCreate(outputbase)); - } - break; - } - } - - return renderer; - } - - /** - * Creates documents for given renderer. - * - * @param filename input image - * @param outputbase output filename without extension - * @param formats types of renderer - * @throws TesseractException - */ - @Override - public void createDocuments(String filename, String outputbase, List formats) throws TesseractException { - createDocuments(new String[]{filename}, new String[]{outputbase}, formats); - } - - /** - * Creates documents. - * - * @param filenames array of input files - * @param outputbases array of output filenames without extension - * @param formats types of renderer - * @throws TesseractException - */ - @Override - public void createDocuments(String[] filenames, String[] outputbases, List formats) throws TesseractException { - if (filenames.length != outputbases.length) { - throw new RuntimeException("The two arrays must match in length."); - } - - init(); - setTessVariables(); - - try { - for (int i = 0; i < filenames.length; i++) { - File workingTiffFile = null; - try { - String filename = filenames[i]; - - // if PDF, convert to multi-page TIFF - if (filename.toLowerCase().endsWith(".pdf")) { - workingTiffFile = PdfUtilities.convertPdf2Tiff(new File(filename)); - filename = workingTiffFile.getPath(); - } - - TessResultRenderer renderer = createRenderers(outputbases[i], formats); - createDocuments(filename, renderer); - TessDeleteResultRenderer(renderer); - } catch (Exception e) { - // skip the problematic image file - logger.error(e.getMessage(), e); - } finally { - if (workingTiffFile != null && workingTiffFile.exists()) { - workingTiffFile.delete(); - } - } - } - } finally { - dispose(); - } - } - - /** - * Creates documents. - * - * @param filename input file - * @param renderer renderer - * @throws TesseractException - */ - private void createDocuments(String filename, TessResultRenderer renderer) throws TesseractException { - TessBaseAPISetInputName(handle, filename); //for reading a UNLV zone file - int result = TessBaseAPIProcessPages(handle, filename, null, 0, renderer); - -// if (result == ITessAPI.FALSE) { -// throw new TesseractException("Error during processing page."); -// } - } - - /** - * Gets segmented regions at specified page iterator level. - * - * @param bi input image - * @param pageIteratorLevel TessPageIteratorLevel enum - * @return list of Rectangle - * @throws TesseractException - */ - @Override - public List getSegmentedRegions(BufferedImage bi, int pageIteratorLevel) throws TesseractException { - init(); - setTessVariables(); - - try { - List list = new ArrayList(); - setImage(bi, null); - - Boxa boxes = TessBaseAPIGetComponentImages(handle, pageIteratorLevel, TRUE, null, null); - int boxCount = Leptonica1.boxaGetCount(boxes); - for (int i = 0; i < boxCount; i++) { - Box box = Leptonica1.boxaGetBox(boxes, i, L_CLONE); - if (box == null) { - continue; - } - list.add(new Rectangle(box.x, box.y, box.w, box.h)); - PointerByReference pRef = new PointerByReference(); - pRef.setValue(box.getPointer()); - Leptonica1.boxDestroy(pRef); - } - - PointerByReference pRef = new PointerByReference(); - pRef.setValue(boxes.getPointer()); - Leptonica1.boxaDestroy(pRef); - - return list; - } catch (IOException ioe) { - // skip the problematic image - logger.error(ioe.getMessage(), ioe); - throw new TesseractException(ioe); - } finally { - dispose(); - } - } - - /** - * Gets recognized words at specified page iterator level. - * - * @param bi input image - * @param pageIteratorLevel TessPageIteratorLevel enum - * @return list of Word - */ - @Override - public List getWords(BufferedImage bi, int pageIteratorLevel) { - this.init(); - this.setTessVariables(); - - List words = new ArrayList(); - - try { - setImage(bi, null); - - TessBaseAPIRecognize(handle, null); - TessResultIterator ri = TessBaseAPIGetIterator(handle); - TessPageIterator pi = TessResultIteratorGetPageIterator(ri); - TessPageIteratorBegin(pi); - - do { - Pointer ptr = TessResultIteratorGetUTF8Text(ri, pageIteratorLevel); - String text = ptr.getString(0); - TessAPI1.TessDeleteText(ptr); - float confidence = TessResultIteratorConfidence(ri, pageIteratorLevel); - IntBuffer leftB = IntBuffer.allocate(1); - IntBuffer topB = IntBuffer.allocate(1); - IntBuffer rightB = IntBuffer.allocate(1); - IntBuffer bottomB = IntBuffer.allocate(1); - TessPageIteratorBoundingBox(pi, pageIteratorLevel, leftB, topB, rightB, bottomB); - int left = leftB.get(); - int top = topB.get(); - int right = rightB.get(); - int bottom = bottomB.get(); - Word word = new Word(text, confidence, new Rectangle(left, top, right - left, bottom - top)); - words.add(word); - } while (TessPageIteratorNext(pi, pageIteratorLevel) == TRUE); - - return words; - } catch (Exception e) { - return words; - } finally { - dispose(); - } - } - - /** - * Releases all of the native resources used by this instance. - */ - protected void dispose() { - TessBaseAPIDelete(handle); - } -} diff --git a/Tess4J/src/net/sourceforge/tess4j/TesseractException.java b/Tess4J/src/net/sourceforge/tess4j/TesseractException.java deleted file mode 100644 index 3ae7070..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/TesseractException.java +++ /dev/null @@ -1,35 +0,0 @@ -/** - * Copyright @ 2010 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j; - -public class TesseractException extends Exception { - - public TesseractException() { - super(); - } - - public TesseractException(String message) { - super(message); - } - - public TesseractException(Throwable cause) { - super(cause); - } - - public TesseractException(String message, Throwable cause) { - super(message, cause); - } -} diff --git a/Tess4J/src/net/sourceforge/tess4j/Word.java b/Tess4J/src/net/sourceforge/tess4j/Word.java deleted file mode 100644 index 8bff259..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/Word.java +++ /dev/null @@ -1,67 +0,0 @@ -/** - * Copyright @ 2015 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j; - -import java.awt.Rectangle; - -/** - * Encapsulates Tesseract OCR results. - */ -public class Word { - - private final String text; - private final float confidence; - private final Rectangle rect; - - /** - * Constructor. - * - * @param text - * @param confidence - * @param boundingBox - */ - public Word(String text, float confidence, Rectangle boundingBox) { - this.text = text; - this.confidence = confidence; - this.rect = boundingBox; - } - - /** - * @return the text - */ - public String getText() { - return text; - } - - /** - * @return the confidence - */ - public float getConfidence() { - return confidence; - } - - /** - * @return the bounding box - */ - public Rectangle getBoundingBox() { - return rect; - } - - @Override - public String toString() { - return String.format("%s [Confidence: %f Bounding box: %d %d %d %d]", text, confidence, rect.x, rect.y, rect.width, rect.height); - } -} diff --git a/Tess4J/src/net/sourceforge/tess4j/util/ImageHelper.java b/Tess4J/src/net/sourceforge/tess4j/util/ImageHelper.java deleted file mode 100644 index b117b90..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/util/ImageHelper.java +++ /dev/null @@ -1,216 +0,0 @@ -/** - * Copyright @ 2008 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j.util; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.awt.Graphics2D; -import java.awt.Image; -import java.awt.RenderingHints; -import java.awt.Toolkit; -import java.awt.Transparency; -import java.awt.datatransfer.Clipboard; -import java.awt.datatransfer.DataFlavor; -import java.awt.image.*; -import javax.imageio.IIOImage; - -public class ImageHelper { - - private static final Logger logger = LoggerFactory.getLogger(new LoggHelper().toString()); - - /** - * Convenience method that returns a scaled instance of the provided - * {@code BufferedImage}. - * - * @param image the original image to be scaled - * @param targetWidth the desired width of the scaled instance, in pixels - * @param targetHeight the desired height of the scaled instance, in pixels - * @return a scaled version of the original {@code BufferedImage} - */ - public static BufferedImage getScaledInstance(BufferedImage image, int targetWidth, int targetHeight) { - int type = (image.getTransparency() == Transparency.OPAQUE) - ? BufferedImage.TYPE_INT_RGB : BufferedImage.TYPE_INT_ARGB; - BufferedImage tmp = new BufferedImage(targetWidth, targetHeight, type); - Graphics2D g2 = tmp.createGraphics(); - g2.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BICUBIC); - g2.drawImage(image, 0, 0, targetWidth, targetHeight, null); - g2.dispose(); - return tmp; - } - - /** - * Convenience method that returns a scaled instance of the provided - * {@code IIOImage}. - * - * @param iioSource the original image to be scaled - * @param scale the desired scale - * @return a scaled version of the original {@code IIOImage} - */ - public static IIOImage getScaledInstance(IIOImage iioSource, float scale) { - if (!(iioSource.getRenderedImage() instanceof BufferedImage)) { - throw new IllegalArgumentException("RenderedImage in IIOImage must be BufferedImage"); - } - - if (Math.abs(scale - 1.0) < 0.001) { - return iioSource; - } - - BufferedImage source = (BufferedImage) iioSource.getRenderedImage(); - BufferedImage target = getScaledInstance(source, (int) (scale * source.getWidth()), (int) (scale * source.getHeight())); - return new IIOImage(target, null, null); - } - - /** - * A replacement for the standard BufferedImage.getSubimage - * method. - * - * @param image - * @param x the X coordinate of the upper-left corner of the specified - * rectangular region - * @param y the Y coordinate of the upper-left corner of the specified - * rectangular region - * @param width the width of the specified rectangular region - * @param height the height of the specified rectangular region - * @return a BufferedImage that is the subimage of image. - */ - public static BufferedImage getSubImage(BufferedImage image, int x, int y, int width, int height) { - int type = (image.getTransparency() == Transparency.OPAQUE) - ? BufferedImage.TYPE_INT_RGB : BufferedImage.TYPE_INT_ARGB; - BufferedImage tmp = new BufferedImage(width, height, type); - Graphics2D g2 = tmp.createGraphics(); - g2.drawImage(image.getSubimage(x, y, width, height), 0, 0, null); - g2.dispose(); - return tmp; - } - - /** - * A simple method to convert an image to binary or B/W image. - * - * @param image input image - * @return a monochrome image - */ - public static BufferedImage convertImageToBinary(BufferedImage image) { - BufferedImage tmp = new BufferedImage(image.getWidth(), image.getHeight(), BufferedImage.TYPE_BYTE_BINARY); - Graphics2D g2 = tmp.createGraphics(); - g2.drawImage(image, 0, 0, null); - g2.dispose(); - return tmp; - } - - /** - * A simple method to convert an image to binary or B/W image. - * - * @param image input image - * @return a monochrome image - * @deprecated As of release 1.1, renamed to - * {@link #convertImageToBinary(BufferedImage image)} - */ - @Deprecated - public static BufferedImage convertImage2Binary(BufferedImage image) { - return convertImageToBinary(image); - } - - /** - * A simple method to convert an image to gray scale. - * - * @param image input image - * @return a monochrome image - */ - public static BufferedImage convertImageToGrayscale(BufferedImage image) { - BufferedImage tmp = new BufferedImage(image.getWidth(), image.getHeight(), BufferedImage.TYPE_BYTE_GRAY); - Graphics2D g2 = tmp.createGraphics(); - g2.drawImage(image, 0, 0, null); - g2.dispose(); - return tmp; - } - - private static final short[] invertTable; - - static { - invertTable = new short[256]; - for (int i = 0; i < 256; i++) { - invertTable[i] = (short) (255 - i); - } - } - - /** - * Inverts image color. - * - * @param image input image - * @return an inverted-color image - */ - public static BufferedImage invertImageColor(BufferedImage image) { - BufferedImage tmp = new BufferedImage(image.getWidth(), image.getHeight(), image.getType()); - BufferedImageOp invertOp = new LookupOp(new ShortLookupTable(0, invertTable), null); - return invertOp.filter(image, tmp); - } - - /** - * Rotates an image. - * - * @param image the original image - * @param angle the degree of rotation - * @return a rotated image - */ - public static BufferedImage rotateImage(BufferedImage image, double angle) { - double theta = Math.toRadians(angle); - double sin = Math.abs(Math.sin(theta)); - double cos = Math.abs(Math.cos(theta)); - int w = image.getWidth(); - int h = image.getHeight(); - int newW = (int) Math.floor(w * cos + h * sin); - int newH = (int) Math.floor(h * cos + w * sin); - - BufferedImage tmp = new BufferedImage(newW, newH, image.getType()); - Graphics2D g2d = tmp.createGraphics(); - g2d.setRenderingHint(RenderingHints.KEY_INTERPOLATION, - RenderingHints.VALUE_INTERPOLATION_BICUBIC); - g2d.translate((newW - w) / 2, (newH - h) / 2); - g2d.rotate(theta, w / 2, h / 2); - g2d.drawImage(image, 0, 0, null); - g2d.dispose(); - return tmp; - } - - /** - * Gets an image from Clipboard. - * - * @return image - */ - public static Image getClipboardImage() { - Clipboard clipboard = Toolkit.getDefaultToolkit().getSystemClipboard(); - try { - return (Image) clipboard.getData(DataFlavor.imageFlavor); - } catch (Exception e) { - return null; - } - } - - /** - * Clones an image. - * http://stackoverflow.com/questions/3514158/how-do-you-clone-a-bufferedimage - * - * @param bi - * @return - */ - public static BufferedImage cloneImage(BufferedImage bi) { - ColorModel cm = bi.getColorModel(); - boolean isAlphaPremultiplied = cm.isAlphaPremultiplied(); - WritableRaster raster = bi.copyData(null); - return new BufferedImage(cm, raster, isAlphaPremultiplied, null); - } -} diff --git a/Tess4J/src/net/sourceforge/tess4j/util/ImageIOHelper.java b/Tess4J/src/net/sourceforge/tess4j/util/ImageIOHelper.java deleted file mode 100644 index e6bb7cc..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/util/ImageIOHelper.java +++ /dev/null @@ -1,643 +0,0 @@ -/** - * Copyright @ 2008 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j.util; - -import java.io.*; - -import java.util.*; -import javax.imageio.*; -import javax.imageio.stream.*; -import javax.imageio.metadata.*; -import java.awt.Toolkit; -import java.awt.image.*; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; - -import org.w3c.dom.NodeList; - -import com.github.jaiimageio.plugins.tiff.*; -import com.recognition.software.jdeskew.ImageDeskew; -import com.recognition.software.jdeskew.ImageUtil; -import org.apache.commons.io.FilenameUtils; - -public class ImageIOHelper { - - final static String OUTPUT_FILE_NAME = "Tesstmp"; - final static String TIFF_EXT = ".tif"; - final static String TIFF_FORMAT = "tiff"; - final static String JAI_IMAGE_WRITER_MESSAGE = "Need to install JAI Image I/O package.\nhttps://java.net/projects/jai-imageio/"; - final static String JAI_IMAGE_READER_MESSAGE = "Unsupported image format. May need to install JAI Image I/O package.\nhttps://java.net/projects/jai-imageio/"; - - /** - * Creates a list of TIFF image files from an image file. It basically - * converts images of other formats to TIFF format, or a multi-page TIFF - * image to multiple TIFF image files. - * - * @param imageFile input image file - * @param index an index of the page; -1 means all pages, as in a multi-page - * TIFF image - * @return a list of TIFF image files - * @throws IOException - */ - public static List createTiffFiles(File imageFile, int index) throws IOException { - return createTiffFiles(imageFile, index, false); - } - - /** - * Creates a list of TIFF image files from an image file. It basically - * converts images of other formats to TIFF format, or a multi-page TIFF - * image to multiple TIFF image files. - * - * @param imageFile input image file - * @param index an index of the page; -1 means all pages, as in a multi-page - * TIFF image - * @param preserve preserve compression mode - * @return a list of TIFF image files - * @throws IOException - */ - public static List createTiffFiles(File imageFile, int index, boolean preserve) throws IOException { - List tiffFiles = new ArrayList(); - - String imageFileName = imageFile.getName(); - String imageFormat = imageFileName.substring(imageFileName.lastIndexOf('.') + 1); - - Iterator readers = ImageIO.getImageReadersByFormatName(imageFormat); - - if (!readers.hasNext()) { - throw new RuntimeException(JAI_IMAGE_READER_MESSAGE); - } - - ImageReader reader = readers.next(); - - ImageInputStream iis = ImageIO.createImageInputStream(imageFile); - reader.setInput(iis); - //Read the stream metadata -// IIOMetadata streamMetadata = reader.getStreamMetadata(); - - //Set up the writeParam - TIFFImageWriteParam tiffWriteParam = new TIFFImageWriteParam(Locale.US); - - if (!preserve) { - tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED); // not preserve original sizes; decompress - } - - //Get tif writer and set output to file - Iterator writers = ImageIO.getImageWritersByFormatName(TIFF_FORMAT); - - if (!writers.hasNext()) { - throw new RuntimeException(JAI_IMAGE_WRITER_MESSAGE); - } - - ImageWriter writer = writers.next(); - - //Read the stream metadata - IIOMetadata streamMetadata = writer.getDefaultStreamMetadata(tiffWriteParam); - - int imageTotal = reader.getNumImages(true); - - for (int i = 0; i < imageTotal; i++) { - // all if index == -1; otherwise, only index-th - if (index == -1 || i == index) { -// BufferedImage bi = reader.read(i); -// IIOImage oimage = new IIOImage(bi, null, reader.getImageMetadata(i)); - IIOImage oimage = reader.readAll(i, reader.getDefaultReadParam()); - File tiffFile = File.createTempFile(OUTPUT_FILE_NAME, TIFF_EXT); - ImageOutputStream ios = ImageIO.createImageOutputStream(tiffFile); - writer.setOutput(ios); - writer.write(streamMetadata, oimage, tiffWriteParam); - ios.close(); - tiffFiles.add(tiffFile); - } - } - writer.dispose(); - reader.dispose(); - - return tiffFiles; - } - - /** - * Creates a list of TIFF image files from a list of IIOImage - * objects. - * - * @param imageList a list of IIOImage objects - * @param index an index of the page; -1 means all pages - * @return a list of TIFF image files - * @throws IOException - */ - public static List createTiffFiles(List imageList, int index) throws IOException { - return createTiffFiles(imageList, index, 0, 0); - } - - public static List createTiffFiles(List imageList, int index, int dpiX, int dpiY) throws IOException { - List tiffFiles = new ArrayList(); - - //Set up the writeParam - TIFFImageWriteParam tiffWriteParam = new TIFFImageWriteParam(Locale.US); - tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED); - - //Get tif writer and set output to file - Iterator writers = ImageIO.getImageWritersByFormatName(TIFF_FORMAT); - - if (!writers.hasNext()) { - throw new RuntimeException(JAI_IMAGE_WRITER_MESSAGE); - } - - ImageWriter writer = writers.next(); - - //Get the stream metadata - IIOMetadata streamMetadata = writer.getDefaultStreamMetadata(tiffWriteParam); - - // all if index == -1; otherwise, only index-th - for (IIOImage oimage : (index == -1 ? imageList : imageList.subList(index, index + 1))) { - if (dpiX != 0 && dpiY != 0) { - // Get the default image metadata. - ImageTypeSpecifier imageType = ImageTypeSpecifier.createFromRenderedImage(oimage.getRenderedImage()); - IIOMetadata imageMetadata = writer.getDefaultImageMetadata(imageType, null); - imageMetadata = setDPIViaAPI(imageMetadata, dpiX, dpiY); - oimage.setMetadata(imageMetadata); - } - - File tiffFile = File.createTempFile(OUTPUT_FILE_NAME, TIFF_EXT); - ImageOutputStream ios = ImageIO.createImageOutputStream(tiffFile); - writer.setOutput(ios); - writer.write(streamMetadata, oimage, tiffWriteParam); - ios.close(); - tiffFiles.add(tiffFile); - } - writer.dispose(); - - return tiffFiles; - } - - /** - * Set DPI using API. - * - * @param imageMetadata original IIOMetadata - * @param dpiX horizontal resolution - * @param dpiY vertical resolution - * @return modified IIOMetadata - * @throws IIOInvalidTreeException - */ - private static IIOMetadata setDPIViaAPI(IIOMetadata imageMetadata, int dpiX, int dpiY) - throws IIOInvalidTreeException { - // Derive the TIFFDirectory from the metadata. - TIFFDirectory dir = TIFFDirectory.createFromMetadata(imageMetadata); - - // Get {X,Y}Resolution tags. - BaselineTIFFTagSet base = BaselineTIFFTagSet.getInstance(); - TIFFTag tagXRes = base.getTag(BaselineTIFFTagSet.TAG_X_RESOLUTION); - TIFFTag tagYRes = base.getTag(BaselineTIFFTagSet.TAG_Y_RESOLUTION); - - // Create {X,Y}Resolution fields. - TIFFField fieldXRes = new TIFFField(tagXRes, TIFFTag.TIFF_RATIONAL, - 1, new long[][]{{dpiX, 1}}); - TIFFField fieldYRes = new TIFFField(tagYRes, TIFFTag.TIFF_RATIONAL, - 1, new long[][]{{dpiY, 1}}); - - // Append {X,Y}Resolution fields to directory. - dir.addTIFFField(fieldXRes); - dir.addTIFFField(fieldYRes); - - // Convert to metadata object. - IIOMetadata metadata = dir.getAsMetadata(); - - // Add other metadata. - IIOMetadataNode root = new IIOMetadataNode("javax_imageio_1.0"); - IIOMetadataNode horiz = new IIOMetadataNode("HorizontalPixelSize"); - horiz.setAttribute("value", Double.toString(25.4f / dpiX)); - IIOMetadataNode vert = new IIOMetadataNode("VerticalPixelSize"); - vert.setAttribute("value", Double.toString(25.4f / dpiY)); - IIOMetadataNode dim = new IIOMetadataNode("Dimension"); - dim.appendChild(horiz); - dim.appendChild(vert); - root.appendChild(dim); - metadata.mergeTree("javax_imageio_1.0", root); - - return metadata; - } - - /** - * Gets pixel data of an IIOImage object. - * - * @param image an IIOImage object - * @return a byte buffer of pixel data - * @throws IOException - */ - public static ByteBuffer getImageByteBuffer(IIOImage image) throws IOException { - return getImageByteBuffer(image.getRenderedImage()); - } - - /** - * Gets pixel data of an RenderedImage object. - * - * @param image an RenderedImage object - * @return a byte buffer of pixel data - * @throws IOException - */ - public static ByteBuffer getImageByteBuffer(RenderedImage image) throws IOException { - //Set up the writeParam - TIFFImageWriteParam tiffWriteParam = new TIFFImageWriteParam(Locale.US); - tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED); - - //Get tif writer and set output to file - Iterator writers = ImageIO.getImageWritersByFormatName(TIFF_FORMAT); - - if (!writers.hasNext()) { - throw new RuntimeException(JAI_IMAGE_WRITER_MESSAGE); - } - - ImageWriter writer = writers.next(); - - //Get the stream metadata - IIOMetadata streamMetadata = writer.getDefaultStreamMetadata(tiffWriteParam); - - ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); - ImageOutputStream ios = ImageIO.createImageOutputStream(outputStream); - writer.setOutput(ios); - writer.write(streamMetadata, new IIOImage(image, null, null), tiffWriteParam); -// writer.write(image); - writer.dispose(); -// ImageIO.write(image, "tiff", ios); // this can be used in lieu of writer - ios.seek(0); - BufferedImage bi = ImageIO.read(ios); - return convertImageData(bi); - } - - /** - * Converts BufferedImage to ByteBuffer. - * - * @param bi Input image - * @return pixel data - */ - public static ByteBuffer convertImageData(BufferedImage bi) { - DataBuffer buff = bi.getRaster().getDataBuffer(); - // ClassCastException thrown if buff not instanceof DataBufferByte because raster data is not necessarily bytes. - // Convert the original buffered image to grayscale. - if (!(buff instanceof DataBufferByte)) { - bi = ImageHelper.convertImageToGrayscale(bi); - buff = bi.getRaster().getDataBuffer(); - } - byte[] pixelData = ((DataBufferByte) buff).getData(); - // return ByteBuffer.wrap(pixelData); - ByteBuffer buf = ByteBuffer.allocateDirect(pixelData.length); - buf.order(ByteOrder.nativeOrder()); - buf.put(pixelData); - buf.flip(); - return buf; - } - - /** - * Gets a list of BufferedImage objects for an image file. - * - * @param imageFile input image file. It can be any of the supported - * formats, including TIFF, JPEG, GIF, PNG, BMP, JPEG - * @return a list of BufferedImage objects - * @throws IOException - */ - public static List getImageList(File imageFile) throws IOException { - ImageReader reader = null; - ImageInputStream iis = null; - - try { - List biList = new ArrayList(); - - String imageFileName = imageFile.getName(); - String imageFormat = imageFileName.substring(imageFileName.lastIndexOf('.') + 1); - Iterator readers = ImageIO.getImageReadersByFormatName(imageFormat); - if (!readers.hasNext()) { - throw new RuntimeException(JAI_IMAGE_READER_MESSAGE); - } - - reader = readers.next(); - - iis = ImageIO.createImageInputStream(imageFile); - reader.setInput(iis); - - int imageTotal = reader.getNumImages(true); - - for (int i = 0; i < imageTotal; i++) { - BufferedImage bi = reader.read(i); - biList.add(bi); - } - - return biList; - } finally { - try { - if (iis != null) { - iis.close(); - } - if (reader != null) { - reader.dispose(); - } - } catch (Exception e) { - // ignore - } - } - } - - /** - * Gets a list of IIOImage objects for an image file. - * - * @param imageFile input image file. It can be any of the supported - * formats, including TIFF, JPEG, GIF, PNG, BMP, JPEG, and PDF if GPL - * Ghostscript is installed - * @return a list of IIOImage objects - * @throws IOException - */ - public static List getIIOImageList(File imageFile) throws IOException { - File workingTiffFile = null; - - ImageReader reader = null; - ImageInputStream iis = null; - - try { - // convert PDF to TIFF - if (imageFile.getName().toLowerCase().endsWith(".pdf")) { - workingTiffFile = PdfUtilities.convertPdf2Tiff(imageFile); - imageFile = workingTiffFile; - } - - List iioImageList = new ArrayList(); - - String imageFileName = imageFile.getName(); - String imageFormat = imageFileName.substring(imageFileName.lastIndexOf('.') + 1); - if (imageFormat.matches("(pbm|pgm|ppm)")) { - imageFormat = "pnm"; - } else if (imageFormat.matches("(jp2|j2k|jpf|jpx|jpm)")) { - imageFormat = "jpeg2000"; - } - Iterator readers = ImageIO.getImageReadersByFormatName(imageFormat); - - if (!readers.hasNext()) { - throw new RuntimeException(JAI_IMAGE_READER_MESSAGE); - } - - reader = readers.next(); - iis = ImageIO.createImageInputStream(imageFile); - reader.setInput(iis); - - int imageTotal = reader.getNumImages(true); - - for (int i = 0; i < imageTotal; i++) { -// IIOImage oimage = new IIOImage(reader.read(i), null, reader.getImageMetadata(i)); - IIOImage oimage = reader.readAll(i, reader.getDefaultReadParam()); - iioImageList.add(oimage); - } - - return iioImageList; - } finally { - try { - if (iis != null) { - iis.close(); - } - if (reader != null) { - reader.dispose(); - } - } catch (Exception e) { - // ignore - } - if (workingTiffFile != null && workingTiffFile.exists()) { - workingTiffFile.delete(); - } - } - } - - /** - * Gets a list of IIOImage objects for a - * BufferedImage. - * - * @param bi input image - * @return a list of IIOImage objects - * @throws IOException - */ - public static List getIIOImageList(BufferedImage bi) throws IOException { - List iioImageList = new ArrayList(); - IIOImage oimage = new IIOImage(bi, null, null); - iioImageList.add(oimage); - return iioImageList; - } - - /** - * Merges multiple images into one multi-page TIFF image. - * - * @param inputImages an array of image files - * @param outputTiff the output multi-page TIFF file - * @throws IOException - */ - public static void mergeTiff(File[] inputImages, File outputTiff) throws IOException { - if (inputImages.length == 0) { - // if no image - return; - } - - Iterator writers = ImageIO.getImageWritersByFormatName(TIFF_FORMAT); - - if (!writers.hasNext()) { - throw new RuntimeException(JAI_IMAGE_WRITER_MESSAGE); - } - - ImageWriter writer = writers.next(); - - //Set up the writeParam - TIFFImageWriteParam tiffWriteParam = new TIFFImageWriteParam(Locale.US); -// tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED); // commented out to preserve original sizes - - //Get the stream metadata - IIOMetadata streamMetadata = writer.getDefaultStreamMetadata(tiffWriteParam); - - ImageOutputStream ios = ImageIO.createImageOutputStream(outputTiff); - writer.setOutput(ios); - - boolean firstPage = true; - int index = 1; - for (File inputImage : inputImages) { - List iioImages = getIIOImageList(inputImage); - for (IIOImage iioImage : iioImages) { - if (firstPage) { - writer.write(streamMetadata, iioImage, tiffWriteParam); - firstPage = false; - } else { - writer.writeInsert(index++, iioImage, tiffWriteParam); - } - } - } - - ios.close(); - - writer.dispose(); - } - - /** - * Merges multiple images into one multi-page TIFF image. - * - * @param inputImages an array of BufferedImage - * @param outputTiff the output TIFF file - * @throws IOException - */ - public static void mergeTiff(BufferedImage[] inputImages, File outputTiff) throws IOException { - mergeTiff(inputImages, outputTiff, null); - } - - /** - * Merges multiple images into one multi-page TIFF image. - * - * @param inputImages an array of BufferedImage - * @param outputTiff the output TIFF file - * @param compressionType valid values: LZW, CCITT T.6, PackBits - * @throws IOException - */ - public static void mergeTiff(BufferedImage[] inputImages, File outputTiff, String compressionType) throws IOException { - List imageList = new ArrayList(); - - for (BufferedImage inputImage : inputImages) { - imageList.add(new IIOImage(inputImage, null, null)); - } - - mergeTiff(imageList, outputTiff, compressionType); - } - - /** - * Merges multiple images into one multi-page TIFF image. - * - * @param imageList a list of IIOImage objects - * @param outputTiff the output TIFF file - * @throws IOException - */ - public static void mergeTiff(List imageList, File outputTiff) throws IOException { - mergeTiff(imageList, outputTiff, null); - } - - /** - * Merges multiple images into one multi-page TIFF image. - * - * @param imageList a list of IIOImage objects - * @param outputTiff the output TIFF file - * @param compressionType valid values: LZW, CCITT T.6, PackBits - * @throws IOException - */ - public static void mergeTiff(List imageList, File outputTiff, String compressionType) throws IOException { - if (imageList == null || imageList.isEmpty()) { - // if no image - return; - } - - Iterator writers = ImageIO.getImageWritersByFormatName(TIFF_FORMAT); - if (!writers.hasNext()) { - throw new RuntimeException(JAI_IMAGE_WRITER_MESSAGE); - } - - ImageWriter writer = writers.next(); - - //Set up the writeParam - TIFFImageWriteParam tiffWriteParam = new TIFFImageWriteParam(Locale.US); -// tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED); // comment out to preserve original sizes - if (compressionType != null) { - tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_EXPLICIT); - tiffWriteParam.setCompressionType(compressionType); - } - - //Get the stream metadata - IIOMetadata streamMetadata = writer.getDefaultStreamMetadata(tiffWriteParam); - - ImageOutputStream ios = ImageIO.createImageOutputStream(outputTiff); - writer.setOutput(ios); - - int dpiX = 300; - int dpiY = 300; - - for (IIOImage iioImage : imageList) { - // Get the default image metadata. - ImageTypeSpecifier imageType = ImageTypeSpecifier.createFromRenderedImage(iioImage.getRenderedImage()); - IIOMetadata imageMetadata = writer.getDefaultImageMetadata(imageType, null); - imageMetadata = setDPIViaAPI(imageMetadata, dpiX, dpiY); - iioImage.setMetadata(imageMetadata); - } - - IIOImage firstIioImage = imageList.remove(0); - writer.write(streamMetadata, firstIioImage, tiffWriteParam); - - int i = 1; - for (IIOImage iioImage : imageList) { - writer.writeInsert(i++, iioImage, tiffWriteParam); - } - ios.close(); - - writer.dispose(); - } - - /** - * Deskews image. - * - * @param imageFile input image - * @param minimumDeskewThreshold minimum deskew threshold (typically, 0.05d) - * @return temporary multi-page TIFF image file - * @throws IOException - */ - public static File deskewImage(File imageFile, double minimumDeskewThreshold) throws IOException { - List imageList = getImageList(imageFile); - for (int i = 0; i < imageList.size(); i++) { - BufferedImage bi = imageList.get(i); - ImageDeskew deskew = new ImageDeskew(bi); - double imageSkewAngle = deskew.getSkewAngle(); - - if ((imageSkewAngle > minimumDeskewThreshold || imageSkewAngle < -(minimumDeskewThreshold))) { - bi = ImageUtil.rotate(bi, -imageSkewAngle, bi.getWidth() / 2, bi.getHeight() / 2); - imageList.set(i, bi); // replace original with deskewed image - } - } - - File tempImageFile = File.createTempFile(FilenameUtils.getBaseName(imageFile.getName()), ".tif"); - ImageIOHelper.mergeTiff(imageList.toArray(new BufferedImage[0]), tempImageFile); - - return tempImageFile; - } - - /** - * Reads image meta data. - * - * @param oimage - * @return a map of meta data - */ - public static Map readImageData(IIOImage oimage) { - Map dict = new HashMap(); - - IIOMetadata imageMetadata = oimage.getMetadata(); - if (imageMetadata != null) { - IIOMetadataNode dimNode = (IIOMetadataNode) imageMetadata.getAsTree("javax_imageio_1.0"); - NodeList nodes = dimNode.getElementsByTagName("HorizontalPixelSize"); - int dpiX; - if (nodes.getLength() > 0) { - float dpcWidth = Float.parseFloat(nodes.item(0).getAttributes().item(0).getNodeValue()); - dpiX = (int) Math.round(25.4f / dpcWidth); - } else { - dpiX = Toolkit.getDefaultToolkit().getScreenResolution(); - } - dict.put("dpiX", String.valueOf(dpiX)); - - nodes = dimNode.getElementsByTagName("VerticalPixelSize"); - int dpiY; - if (nodes.getLength() > 0) { - float dpcHeight = Float.parseFloat(nodes.item(0).getAttributes().item(0).getNodeValue()); - dpiY = (int) Math.round(25.4f / dpcHeight); - } else { - dpiY = Toolkit.getDefaultToolkit().getScreenResolution(); - } - dict.put("dpiY", String.valueOf(dpiY)); - } - - return dict; - } -} diff --git a/Tess4J/src/net/sourceforge/tess4j/util/LoadLibs.java b/Tess4J/src/net/sourceforge/tess4j/util/LoadLibs.java deleted file mode 100644 index 1d7ec6a..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/util/LoadLibs.java +++ /dev/null @@ -1,232 +0,0 @@ -/** - * Copyright @ 2014 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j.util; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.net.JarURLConnection; -import java.net.URISyntaxException; -import java.net.URL; -import java.net.URLConnection; -import java.util.Enumeration; -import java.util.jar.JarEntry; -import java.util.jar.JarFile; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.jboss.vfs.VFS; -import org.jboss.vfs.VirtualFile; -import org.slf4j.LoggerFactory; - -import com.sun.jna.Native; -import com.sun.jna.Platform; - -import net.sourceforge.tess4j.TessAPI; - -/** - * Loads native libraries from JAR or project folder. - * - * @author O.J. Sousa Rodrigues - * @author Quan Nguyen - */ -public class LoadLibs { - - private static final String VFS_PROTOCOL = "vfs"; - private static final String JNA_LIBRARY_PATH = "jna.library.path"; - public static final String TESS4J_TEMP_DIR = new File(System.getProperty("java.io.tmpdir"), "tess4j").getPath(); - - /** - * Native library name. - */ - public static final String LIB_NAME = "libtesseract3051"; - public static final String LIB_NAME_NON_WIN = "tesseract"; - - private static final org.slf4j.Logger logger = LoggerFactory.getLogger(new LoggHelper().toString()); - - static { - System.setProperty("jna.encoding", "UTF8"); - File targetTempFolder = extractTessResources(Platform.RESOURCE_PREFIX); - if (targetTempFolder != null && targetTempFolder.exists()) { - String userCustomizedPath = System.getProperty(JNA_LIBRARY_PATH); - if (null == userCustomizedPath || userCustomizedPath.isEmpty()) { - System.setProperty(JNA_LIBRARY_PATH, targetTempFolder.getPath()); - } else { - System.setProperty(JNA_LIBRARY_PATH, userCustomizedPath + File.pathSeparator + targetTempFolder.getPath()); - } - } - } - - /** - * Loads Tesseract library via JNA. - * - * @return TessAPI instance being loaded using - * Native.loadLibrary(). - */ - public static TessAPI getTessAPIInstance() { - return (TessAPI) Native.loadLibrary(getTesseractLibName(), TessAPI.class); - } - - /** - * Gets native library name. - * - * @return the name of the tesseract library to be loaded using the - * Native.register(). - */ - public static String getTesseractLibName() { - return Platform.isWindows() ? LIB_NAME : LIB_NAME_NON_WIN; - } - - /** - * Extracts tesseract resources to temp folder. - * - * @param resourceName name of file or directory - * @return target path, which could be file or directory - */ - public static synchronized File extractTessResources(String resourceName) { - File targetPath = null; - - try { - targetPath = new File(TESS4J_TEMP_DIR, resourceName); - - Enumeration resources = LoadLibs.class.getClassLoader().getResources(resourceName); - while (resources.hasMoreElements()) { - URL resourceUrl = resources.nextElement(); - copyResources(resourceUrl, targetPath); - } - } catch (IOException | URISyntaxException e) { - logger.warn(e.getMessage(), e); - } - - return targetPath; - } - - /** - * Copies resources to target folder. - * - * @param resourceUrl - * @param targetPath - * @return - */ - static void copyResources(URL resourceUrl, File targetPath) throws IOException, URISyntaxException { - if (resourceUrl == null) { - return; - } - - URLConnection urlConnection = resourceUrl.openConnection(); - - /** - * Copy resources either from inside jar or from project folder. - */ - if (urlConnection instanceof JarURLConnection) { - copyJarResourceToPath((JarURLConnection) urlConnection, targetPath); - } else if (VFS_PROTOCOL.equals(resourceUrl.getProtocol())) { - VirtualFile virtualFileOrFolder = VFS.getChild(resourceUrl.toURI()); - copyFromWarToFolder(virtualFileOrFolder, targetPath); - } else { - File file = new File(resourceUrl.getPath()); - if (file.isDirectory()) { - for (File resourceFile : FileUtils.listFiles(file, null, true)) { - int index = resourceFile.getPath().lastIndexOf(targetPath.getName()) + targetPath.getName().length(); - File targetFile = new File(targetPath, resourceFile.getPath().substring(index)); - if (!targetFile.exists() || targetFile.length() != resourceFile.length()) { - if (resourceFile.isFile()) { - FileUtils.copyFile(resourceFile, targetFile); - } - } - } - } else { - if (!targetPath.exists() || targetPath.length() != file.length()) { - FileUtils.copyFile(file, targetPath); - } - } - } - } - - /** - * Copies resources from the jar file of the current thread and extract it - * to the destination path. - * - * @param jarConnection - * @param destPath destination file or directory - */ - static void copyJarResourceToPath(JarURLConnection jarConnection, File destPath) { - try (JarFile jarFile = jarConnection.getJarFile()) { - String jarConnectionEntryName = jarConnection.getEntryName(); - if (!jarConnectionEntryName.endsWith("/")) { - jarConnectionEntryName += "/"; - } - - /** - * Iterate all entries in the jar file. - */ - for (Enumeration e = jarFile.entries(); e.hasMoreElements();) { - JarEntry jarEntry = e.nextElement(); - String jarEntryName = jarEntry.getName(); - - /** - * Extract files only if they match the path. - */ - if (jarEntryName.startsWith(jarConnectionEntryName)) { - String filename = jarEntryName.substring(jarConnectionEntryName.length()); - File targetFile = new File(destPath, filename); - - if (jarEntry.isDirectory()) { - targetFile.mkdirs(); - } else { - if (!targetFile.exists() || targetFile.length() != jarEntry.getSize()) { - try (InputStream is = jarFile.getInputStream(jarEntry); - OutputStream out = FileUtils.openOutputStream(targetFile)) { - IOUtils.copy(is, out); - } - } - } - } - } - } catch (IOException e) { - logger.warn(e.getMessage(), e); - } - } - - /** - * Copies resources from WAR to target folder. - * - * @param virtualFileOrFolder - * @param targetFolder - * @throws IOException - */ - static void copyFromWarToFolder(VirtualFile virtualFileOrFolder, File targetFolder) throws IOException { - if (virtualFileOrFolder.isDirectory() && !virtualFileOrFolder.getName().contains(".")) { - if (targetFolder.getName().equalsIgnoreCase(virtualFileOrFolder.getName())) { - for (VirtualFile innerFileOrFolder : virtualFileOrFolder.getChildren()) { - copyFromWarToFolder(innerFileOrFolder, targetFolder); - } - } else { - File innerTargetFolder = new File(targetFolder, virtualFileOrFolder.getName()); - innerTargetFolder.mkdir(); - for (VirtualFile innerFileOrFolder : virtualFileOrFolder.getChildren()) { - copyFromWarToFolder(innerFileOrFolder, innerTargetFolder); - } - } - } else { - File targetFile = new File(targetFolder, virtualFileOrFolder.getName()); - if (!targetFile.exists() || targetFile.length() != virtualFileOrFolder.getSize()) { - FileUtils.copyURLToFile(virtualFileOrFolder.asFileURL(), targetFile); - } - } - } -} diff --git a/Tess4J/src/net/sourceforge/tess4j/util/LoggHelper.java b/Tess4J/src/net/sourceforge/tess4j/util/LoggHelper.java deleted file mode 100644 index 299ee61..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/util/LoggHelper.java +++ /dev/null @@ -1,34 +0,0 @@ -/** - * Copyright @ 2015 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j.util; - -/** - * Helper for logging. - * - * @author O.J. Sousa Rodrigues - */ -public class LoggHelper extends Exception { - - @Override - public String toString() { - LoggerConfig.INSTANCE.loadConfig(); - - StackTraceElement[] sTrace = this.getStackTrace(); - String className = sTrace[0].getClassName(); - - return className; - } -} diff --git a/Tess4J/src/net/sourceforge/tess4j/util/LoggerConfig.java b/Tess4J/src/net/sourceforge/tess4j/util/LoggerConfig.java deleted file mode 100644 index c54661a..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/util/LoggerConfig.java +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Copyright @ 2015 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j.util; - -import org.slf4j.bridge.SLF4JBridgeHandler; - -/** - * Logging configuration. - * - * @author O.J. Sousa Rodrigues - */ -public enum LoggerConfig { - - INSTANCE; - - private boolean isLoaded = false; - - /** - * This method loads the Logger configuration. - * - * @return true if the Logger configuration was loaded successfully. - */ - public boolean loadConfig() { - - try { - if (!isLoaded) { - SLF4JBridgeHandler.removeHandlersForRootLogger(); - SLF4JBridgeHandler.install(); - this.isLoaded = true; -// System.out.println("Logger configuration was loaded successfully."); - } - } catch (final Exception e) { - System.err.println("Logger configuration could not be loaded."); - } - - return this.isLoaded; - } -} diff --git a/Tess4J/src/net/sourceforge/tess4j/util/PdfBoxUtilities.java b/Tess4J/src/net/sourceforge/tess4j/util/PdfBoxUtilities.java deleted file mode 100644 index 869b727..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/util/PdfBoxUtilities.java +++ /dev/null @@ -1,215 +0,0 @@ -/** - * Copyright @ 2018 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j.util; - -import java.awt.image.BufferedImage; -import java.io.File; -import java.io.FilenameFilter; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; - -import org.apache.pdfbox.io.MemoryUsageSetting; -import org.apache.pdfbox.multipdf.PDFMergerUtility; -import org.apache.pdfbox.multipdf.Splitter; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.rendering.ImageType; -import org.apache.pdfbox.rendering.PDFRenderer; -import org.apache.pdfbox.tools.imageio.ImageIOUtil; -import org.slf4j.LoggerFactory; - -/** - * PDF utilities based on PDFBox. - * - * @author Robert Drysdale - * @author Quan Nguyen - */ -public class PdfBoxUtilities { - - private static final org.slf4j.Logger logger = LoggerFactory.getLogger(new LoggHelper().toString()); - - /** - * Converts PDF to TIFF format. - * - * @param inputPdfFile input file - * @return a multi-page TIFF image - * @throws IOException - */ - public static File convertPdf2Tiff(File inputPdfFile) throws IOException { - File[] pngFiles = null; - - try { - pngFiles = convertPdf2Png(inputPdfFile); - File tiffFile = File.createTempFile("multipage", ".tif"); - - // put PNG images into a single multi-page TIFF image for return - ImageIOHelper.mergeTiff(pngFiles, tiffFile); - return tiffFile; - } finally { - if (pngFiles != null && pngFiles.length > 0) { - // get the working directory of the PNG files - File pngDirectory = new File(pngFiles[0].getParent()); - // delete temporary PNG images - for (File tempFile : pngFiles) { - tempFile.delete(); - } - - pngDirectory.delete(); - } - } - } - - /** - * Converts PDF to PNG format. - * - * @param inputPdfFile input file - * @return an array of PNG images - * @throws java.io.IOException - */ - public static File[] convertPdf2Png(File inputPdfFile) throws IOException { - Path path = Files.createTempDirectory("tessimages"); - File imageDir = path.toFile(); - - PDDocument document = null; - try { - document = PDDocument.load(inputPdfFile); - PDFRenderer pdfRenderer = new PDFRenderer(document); - for (int page = 0; page < document.getNumberOfPages(); ++page) { - BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB); - - // suffix in filename will be used as the file format - String filename = String.format("workingimage%04d.png", page + 1); - ImageIOUtil.writeImage(bim, new File(imageDir, filename).getAbsolutePath(), 300); - } - } catch (IOException ioe) { - logger.error("Error extracting PDF Document => " + ioe); - } finally { - if (imageDir.list().length == 0) { - imageDir.delete(); - } - - if (document != null) { - try { - document.close(); - } catch (Exception e) { - } - } - } - - // find working files - File[] workingFiles = imageDir.listFiles(new FilenameFilter() { - - @Override - public boolean accept(File dir, String name) { - return name.toLowerCase().matches("workingimage\\d{4}\\.png$"); - } - }); - - Arrays.sort(workingFiles, new Comparator() { - @Override - public int compare(File f1, File f2) { - return f1.getName().compareTo(f2.getName()); - } - }); - - return workingFiles; - } - - /** - * Splits PDF. - * - * @param inputPdfFile input file - * @param outputPdfFile output file - * @param firstPage begin page - * @param lastPage end page - */ - public static void splitPdf(File inputPdfFile, File outputPdfFile, int firstPage, int lastPage) { - PDDocument document = null; - try { - document = PDDocument.load(inputPdfFile); - Splitter splitter = new Splitter(); - - splitter.setStartPage(firstPage); - splitter.setEndPage(lastPage); - splitter.setSplitAtPage(lastPage - firstPage + 1); - - List documents = splitter.split(document); - - if (documents.size() == 1) { - PDDocument outputPdf = documents.get(0); - outputPdf.save(outputPdfFile); - outputPdf.close(); - } else { - logger.error("Splitter returned " + documents.size() + " documents rather than expected of 1"); - } - } catch (IOException ioe) { - logger.error("Exception splitting PDF => " + ioe); - } finally { - if (document != null) { - try { - document.close(); - } catch (Exception e) { - } - } - } - } - - /** - * Gets PDF Page Count. - * - * @param inputPdfFile input file - * @return number of pages - */ - public static int getPdfPageCount(File inputPdfFile) { - PDDocument document = null; - try { - document = PDDocument.load(inputPdfFile); - return document.getNumberOfPages(); - } catch (IOException ioe) { - logger.error("Error counting PDF pages => " + ioe); - return - 1; - } finally { - if (document != null) { - try { - document.close(); - } catch (Exception e) { - } - } - } - } - - /** - * Merges PDF files. - * - * @param inputPdfFiles array of input files - * @param outputPdfFile output file - */ - public static void mergePdf(File[] inputPdfFiles, File outputPdfFile) { - try { - PDFMergerUtility mergerUtility = new PDFMergerUtility(); - mergerUtility.setDestinationFileName(outputPdfFile.getPath()); - for (File inputPdfFile : inputPdfFiles) { - mergerUtility.addSource(inputPdfFile); - } - mergerUtility.mergeDocuments(MemoryUsageSetting.setupMainMemoryOnly()); - } catch (IOException ioe) { - logger.error("Error counting PDF pages => " + ioe); - } - } -} diff --git a/Tess4J/src/net/sourceforge/tess4j/util/PdfGsUtilities.java b/Tess4J/src/net/sourceforge/tess4j/util/PdfGsUtilities.java deleted file mode 100644 index 8f94cbe..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/util/PdfGsUtilities.java +++ /dev/null @@ -1,319 +0,0 @@ -/** - * Copyright @ 2009 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j.util; - -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FilenameFilter; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; - -import org.ghost4j.Ghostscript; -import org.ghost4j.GhostscriptException; -import org.slf4j.LoggerFactory; - -/** - * PDF utilities based on Ghostscript. - */ -public class PdfGsUtilities { - - public static final String GS_INSTALL = "\nPlease download, install GPL Ghostscript from http://www.ghostscript.com\nand/or set the appropriate path variable."; - - private static final org.slf4j.Logger logger = LoggerFactory.getLogger(new LoggHelper().toString()); - - /** - * Converts PDF to TIFF format. - * - * @param inputPdfFile input file - * @return a multi-page TIFF image - * @throws IOException - */ - public static File convertPdf2Tiff(File inputPdfFile) throws IOException { - File[] pngFiles = null; - - try { - pngFiles = convertPdf2Png(inputPdfFile); - File tiffFile = File.createTempFile("multipage", ".tif"); - - // put PNG images into a single multi-page TIFF image for return - ImageIOHelper.mergeTiff(pngFiles, tiffFile); - return tiffFile; - } finally { - if (pngFiles != null && pngFiles.length > 0) { - // get the working directory of the PNG files - File pngDirectory = new File(pngFiles[0].getParent()); - // delete temporary PNG images - for (File tempFile : pngFiles) { - tempFile.delete(); - } - - pngDirectory.delete(); - } - } - } - - /** - * Converts PDF to PNG format. - * - * @param inputPdfFile input file - * @return an array of PNG images - * @throws java.io.IOException - */ - public synchronized static File[] convertPdf2Png(File inputPdfFile) throws IOException { - Path path = Files.createTempDirectory("tessimages"); - File imageDir = path.toFile(); - - //get Ghostscript instance - Ghostscript gs = Ghostscript.getInstance(); - - //prepare Ghostscript interpreter parameters - //refer to Ghostscript documentation for parameter usage - List gsArgs = new ArrayList(); - gsArgs.add("-gs"); - gsArgs.add("-dNOPAUSE"); - gsArgs.add("-dQUIET"); - gsArgs.add("-dBATCH"); - gsArgs.add("-dSAFER"); - gsArgs.add("-sDEVICE=pnggray"); - gsArgs.add("-r300"); - gsArgs.add("-dGraphicsAlphaBits=4"); - gsArgs.add("-dTextAlphaBits=4"); - gsArgs.add("-sOutputFile=" + imageDir.getPath() + "/workingimage%04d.png"); - gsArgs.add(inputPdfFile.getPath()); - - //execute and exit interpreter - try { - synchronized (gs) { - gs.initialize(gsArgs.toArray(new String[0])); - gs.exit(); - } - } catch (UnsatisfiedLinkError e) { - logger.error(e.getMessage()); - throw new RuntimeException(getMessage(e.getMessage())); - } catch (NoClassDefFoundError e) { - logger.error(e.getMessage()); - throw new RuntimeException(getMessage(e.getMessage())); - } catch (GhostscriptException e) { - logger.error(e.getMessage()); - throw new RuntimeException(e.getMessage()); - } finally { - if (imageDir.list().length == 0) { - imageDir.delete(); - } - - //delete interpreter instance (safer) - try { - Ghostscript.deleteInstance(); - } catch (GhostscriptException e) { - //nothing - } - } - - // find working files - File[] workingFiles = imageDir.listFiles(new FilenameFilter() { - - @Override - public boolean accept(File dir, String name) { - return name.toLowerCase().matches("workingimage\\d{4}\\.png$"); - } - }); - - Arrays.sort(workingFiles, new Comparator() { - @Override - public int compare(File f1, File f2) { - return f1.getName().compareTo(f2.getName()); - } - }); - - return workingFiles; - } - - /** - * Splits PDF. - * - * @param inputPdfFile input file - * @param outputPdfFile output file - * @param firstPage begin page - * @param lastPage end page - */ - public static void splitPdf(File inputPdfFile, File outputPdfFile, int firstPage, int lastPage) { - //get Ghostscript instance - Ghostscript gs = Ghostscript.getInstance(); - - //prepare Ghostscript interpreter parameters - //refer to Ghostscript documentation for parameter usage - //gs -sDEVICE=pdfwrite -dNOPAUSE -dQUIET -dBATCH -dFirstPage=m -dLastPage=n -sOutputFile=out.pdf in.pdf - List gsArgs = new ArrayList(); - gsArgs.add("-gs"); - gsArgs.add("-dNOPAUSE"); - gsArgs.add("-dQUIET"); - gsArgs.add("-dBATCH"); - gsArgs.add("-sDEVICE=pdfwrite"); - if (firstPage > 0) { - gsArgs.add("-dFirstPage=" + firstPage); - } - if (lastPage > 0) { - gsArgs.add("-dLastPage=" + lastPage); - } - gsArgs.add("-sOutputFile=" + outputPdfFile.getPath()); - gsArgs.add(inputPdfFile.getPath()); - - //execute and exit interpreter - try { - synchronized (gs) { - gs.initialize(gsArgs.toArray(new String[0])); - gs.exit(); - } - } catch (UnsatisfiedLinkError e) { - logger.error(e.getMessage()); - throw new RuntimeException(getMessage(e.getMessage())); - } catch (NoClassDefFoundError e) { - logger.error(e.getMessage()); - throw new RuntimeException(getMessage(e.getMessage())); - } catch (GhostscriptException e) { - logger.error(e.getMessage()); - throw new RuntimeException(e.getMessage()); - } finally { - //delete interpreter instance (safer) - try { - Ghostscript.deleteInstance(); - } catch (GhostscriptException e) { - //nothing - } - } - } - - /** - * Gets PDF Page Count. - * - * @param inputPdfFile input file - * @return number of pages - */ - public static int getPdfPageCount(File inputPdfFile) { - //get Ghostscript instance - Ghostscript gs = Ghostscript.getInstance(); - - //prepare Ghostscript interpreter parameters - //refer to Ghostscript documentation for parameter usage - //gs -q -dNODISPLAY -c "(input.pdf) (r) file runpdfbegin pdfpagecount = quit" - List gsArgs = new ArrayList(); - gsArgs.add("-gs"); - gsArgs.add("-dNOPAUSE"); - gsArgs.add("-dQUIET"); - gsArgs.add("-dNODISPLAY"); - gsArgs.add("-dBATCH"); - gsArgs.add("-c"); - String cValue = String.format("(%s) (r) file runpdfbegin pdfpagecount = quit", inputPdfFile.getPath().replace('\\', '/')); - gsArgs.add(cValue); - - int pageCount = 0; - ByteArrayOutputStream os; - - //execute and exit interpreter - try { - synchronized (gs) { - //output - os = new ByteArrayOutputStream(); - gs.setStdOut(os); - gs.initialize(gsArgs.toArray(new String[0])); - pageCount = Integer.parseInt(os.toString().trim()); - os.close(); - } - } catch (UnsatisfiedLinkError e) { - logger.error(e.getMessage()); - throw new RuntimeException(getMessage(e.getMessage())); - } catch (NoClassDefFoundError e) { - logger.error(e.getMessage()); - throw new RuntimeException(getMessage(e.getMessage())); - } catch (GhostscriptException e) { - logger.error(e.getMessage()); - throw new RuntimeException(e.getMessage()); - } catch (Exception e) { - logger.error(e.getMessage()); - } finally { - //delete interpreter instance (safer) - try { - Ghostscript.deleteInstance(); - } catch (GhostscriptException e) { - //nothing - } - } - - return pageCount; - } - - /** - * Merges PDF files. - * - * @param inputPdfFiles array of input files - * @param outputPdfFile output file - */ - public static void mergePdf(File[] inputPdfFiles, File outputPdfFile) { - //get Ghostscript instance - Ghostscript gs = Ghostscript.getInstance(); - - //prepare Ghostscript interpreter parameters - //refer to Ghostscript documentation for parameter usage - //gs -sDEVICE=pdfwrite -dNOPAUSE -dQUIET -dBATCH -sOutputFile=out.pdf in1.pdf in2.pdf in3.pdf - List gsArgs = new ArrayList(); - gsArgs.add("-gs"); - gsArgs.add("-dNOPAUSE"); - gsArgs.add("-dQUIET"); - gsArgs.add("-dBATCH"); - gsArgs.add("-sDEVICE=pdfwrite"); - gsArgs.add("-sOutputFile=" + outputPdfFile.getPath()); - - for (File inputPdfFile : inputPdfFiles) { - gsArgs.add(inputPdfFile.getPath()); - } - - //execute and exit interpreter - try { - synchronized (gs) { - gs.initialize(gsArgs.toArray(new String[0])); - gs.exit(); - } - } catch (UnsatisfiedLinkError e) { - logger.error(e.getMessage()); - throw new RuntimeException(getMessage(e.getMessage())); - } catch (NoClassDefFoundError e) { - logger.error(e.getMessage()); - throw new RuntimeException(getMessage(e.getMessage())); - } catch (GhostscriptException e) { - logger.error(e.getMessage()); - throw new RuntimeException(e.getMessage()); - } finally { - //delete interpreter instance (safer) - try { - Ghostscript.deleteInstance(); - } catch (GhostscriptException e) { - //nothing - } - } - } - - static String getMessage(String message) { - if (message.contains("library 'gs") || message.contains("ghost4j")) { - return message + GS_INSTALL; - } - return message; - } -} diff --git a/Tess4J/src/net/sourceforge/tess4j/util/PdfUtilities.java b/Tess4J/src/net/sourceforge/tess4j/util/PdfUtilities.java deleted file mode 100644 index a5635cf..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/util/PdfUtilities.java +++ /dev/null @@ -1,163 +0,0 @@ -/** - * Copyright @ 2009 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j.util; - -import java.io.File; -import java.io.IOException; - -/** - * PDF utilities based on Ghostscript or PDFBox with Ghostscript as default. If - * Ghostscript is not available on the system, then PDFBox is used. Call - * System.setProperty(PDF_LIBRARY, PDFBOX); to set PDFBox as - * default. - */ -public class PdfUtilities { - - public static final String PDF_LIBRARY = "pdf.library"; - public static final String PDFBOX = "pdfbox"; - - /** - * Converts PDF to TIFF format. - * - * @param inputPdfFile input file - * @return a multi-page TIFF image - * @throws IOException - */ - public static File convertPdf2Tiff(File inputPdfFile) throws IOException { - if (PDFBOX.equals(System.getProperty(PDF_LIBRARY))) { - return PdfBoxUtilities.convertPdf2Tiff(inputPdfFile); - } else { - try { - return PdfGsUtilities.convertPdf2Tiff(inputPdfFile); - } catch (Exception e) { - System.setProperty(PDF_LIBRARY, PDFBOX); - return convertPdf2Tiff(inputPdfFile); - } - } - } - - /** - * Converts PDF to PNG format. - * - * @param inputPdfFile input file - * @return an array of PNG images - * @throws java.io.IOException - */ - public static File[] convertPdf2Png(File inputPdfFile) throws IOException { - if (PDFBOX.equals(System.getProperty(PDF_LIBRARY))) { - return PdfBoxUtilities.convertPdf2Png(inputPdfFile); - } else { - try { - return PdfGsUtilities.convertPdf2Png(inputPdfFile); - } catch (Exception e) { - System.setProperty(PDF_LIBRARY, PDFBOX); - return convertPdf2Png(inputPdfFile); - } - } - } - - /** - * Splits PDF. - * - * @deprecated As of Release 3.0. - * - * @param inputPdfFile input file - * @param outputPdfFile output file - * @param firstPage begin page - * @param lastPage end page - */ - public static void splitPdf(String inputPdfFile, String outputPdfFile, String firstPage, String lastPage) { - if (firstPage.trim().isEmpty()) { - firstPage = "0"; - } - if (lastPage.trim().isEmpty()) { - lastPage = "0"; - } - - splitPdf(new File(inputPdfFile), new File(outputPdfFile), Integer.parseInt(firstPage), Integer.parseInt(lastPage)); - } - - /** - * Splits PDF. - * - * @param inputPdfFile input file - * @param outputPdfFile output file - * @param firstPage begin page - * @param lastPage end page - */ - public static void splitPdf(File inputPdfFile, File outputPdfFile, int firstPage, int lastPage) { - if (PDFBOX.equals(System.getProperty(PDF_LIBRARY))) { - PdfBoxUtilities.splitPdf(inputPdfFile, outputPdfFile, firstPage, lastPage); - } else { - try { - PdfGsUtilities.splitPdf(inputPdfFile, outputPdfFile, firstPage, lastPage); - } catch (Exception e) { - System.setProperty(PDF_LIBRARY, PDFBOX); - splitPdf(inputPdfFile, outputPdfFile, firstPage, lastPage); - } - } - } - - /** - * Gets PDF Page Count. - * - * @deprecated As of Release 3.0. - * - * @param inputPdfFile input file - * @return number of pages - */ - public static int getPdfPageCount(String inputPdfFile) { - return getPdfPageCount(new File(inputPdfFile)); - } - - /** - * Gets PDF Page Count. - * - * @param inputPdfFile input file - * @return number of pages - */ - public static int getPdfPageCount(File inputPdfFile) { - if (PDFBOX.equals(System.getProperty(PDF_LIBRARY))) { - return PdfBoxUtilities.getPdfPageCount(inputPdfFile); - } else { - try { - return PdfGsUtilities.getPdfPageCount(inputPdfFile); - } catch (Exception e) { - System.setProperty(PDF_LIBRARY, PDFBOX); - return getPdfPageCount(inputPdfFile); - } - } - } - - /** - * Merges PDF files. - * - * @param inputPdfFiles array of input files - * @param outputPdfFile output file - */ - public static void mergePdf(File[] inputPdfFiles, File outputPdfFile) { - if (PDFBOX.equals(System.getProperty(PDF_LIBRARY))) { - PdfBoxUtilities.mergePdf(inputPdfFiles, outputPdfFile); - } else { - try { - PdfGsUtilities.mergePdf(inputPdfFiles, outputPdfFile); - } catch (Exception e) { - System.setProperty(PDF_LIBRARY, PDFBOX); - mergePdf(inputPdfFiles, outputPdfFile); - } - } - } -} diff --git a/Tess4J/src/net/sourceforge/tess4j/util/Utils.java b/Tess4J/src/net/sourceforge/tess4j/util/Utils.java deleted file mode 100644 index ce42d78..0000000 --- a/Tess4J/src/net/sourceforge/tess4j/util/Utils.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright @ 2013 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package net.sourceforge.tess4j.util; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.lang.reflect.Field; -import java.lang.reflect.Modifier; - -public class Utils { - - /** - * Writes byte array to file. - * - * @param data byte array - * @param outFile output file - * @throws IOException - */ - public static void writeFile(byte[] data, File outFile) throws IOException { - FileOutputStream fos = null; - - try { - // create parent dirs when necessary - if (outFile.getParentFile() != null) { - outFile.getParentFile().mkdirs(); - } - - fos = new FileOutputStream(outFile); - fos.write(data); - } finally { - if (fos != null) { - fos.close(); - } - } - } - - /** - * Gets user-friendly name of the public static final constant defined in a - * class or an interface for display purpose. - * - * @param value the constant value - * @param c type of class or interface - * @return name - */ - public static String getConstantName(Object value, Class c) { - for (Field f : c.getDeclaredFields()) { - int mod = f.getModifiers(); - if (Modifier.isStatic(mod) && Modifier.isPublic(mod) && Modifier.isFinal(mod)) { - try { - if (f.get(null).equals(value)) { - return f.getName(); - } - } catch (IllegalAccessException e) { - return String.valueOf(value); - } - } - } - return String.valueOf(value); - } -} diff --git a/Tess4J/tessdata/configs/api_config b/Tess4J/tessdata/configs/api_config deleted file mode 100644 index 5cd6ec0..0000000 --- a/Tess4J/tessdata/configs/api_config +++ /dev/null @@ -1 +0,0 @@ -tessedit_zero_rejection T diff --git a/Tess4J/tessdata/configs/digits b/Tess4J/tessdata/configs/digits deleted file mode 100644 index 6a329f8..0000000 --- a/Tess4J/tessdata/configs/digits +++ /dev/null @@ -1 +0,0 @@ -tessedit_char_whitelist 0123456789-. diff --git a/Tess4J/tessdata/configs/hocr b/Tess4J/tessdata/configs/hocr deleted file mode 100644 index 72f83e8..0000000 --- a/Tess4J/tessdata/configs/hocr +++ /dev/null @@ -1 +0,0 @@ -tessedit_create_hocr 1 \ No newline at end of file diff --git a/Tess4J/tessdata/eng.traineddata b/Tess4J/tessdata/eng.traineddata deleted file mode 100644 index 561883f..0000000 Binary files a/Tess4J/tessdata/eng.traineddata and /dev/null differ diff --git a/Tess4J/tessdata/osd.traineddata b/Tess4J/tessdata/osd.traineddata deleted file mode 100644 index 527457c..0000000 Binary files a/Tess4J/tessdata/osd.traineddata and /dev/null differ diff --git a/Tess4J/tessdata/pdf.ttf b/Tess4J/tessdata/pdf.ttf deleted file mode 100644 index eb359b3..0000000 Binary files a/Tess4J/tessdata/pdf.ttf and /dev/null differ diff --git a/Tess4J/tessdata/pdf.ttx b/Tess4J/tessdata/pdf.ttx deleted file mode 100644 index c6db1c8..0000000 --- a/Tess4J/tessdata/pdf.ttx +++ /dev/null @@ -1,793 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/Tess4J/test/net/sourceforge/tess4j/ProgressMonitor.java b/Tess4J/test/net/sourceforge/tess4j/ProgressMonitor.java deleted file mode 100644 index 4a2fac6..0000000 --- a/Tess4J/test/net/sourceforge/tess4j/ProgressMonitor.java +++ /dev/null @@ -1,75 +0,0 @@ -/** - * Copyright @ 2014 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j; - -import com.sun.jna.Pointer; -import net.sourceforge.tess4j.util.LoggHelper; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import static net.sourceforge.tess4j.ITessAPI.TRUE; - -class ProgressMonitor extends Thread { - - ITessAPI.ETEXT_DESC monitor; - StringBuilder outputMessage = new StringBuilder(); - - private static final Logger logger = LoggerFactory.getLogger(new LoggHelper().toString()); - - public ProgressMonitor(ITessAPI.ETEXT_DESC monitor) { - this.monitor = monitor; - } - - public String getMessage() { - return outputMessage.toString(); - } - - @Override - public void run() { - try { - while (true) { - logger.info("ocr alive: " + (monitor.ocr_alive == TRUE)); - logger.info("progress: " + monitor.progress); - outputMessage.append(monitor.more_to_come); - if (monitor.progress >= 100) { - break; - } - Thread.sleep(100); - } - } catch (Exception ioe) { - ioe.printStackTrace(); - } - } - - /** - * Cancels OCR operation. - */ - public void cancel() { - monitor.cancel = new ITessAPI.CANCEL_FUNC() { - @Override - public boolean invoke(Pointer cancel_this, int words) { - return true; - } - }; - } - - /** - * Resets cancel flag. - */ - public void reset() { - monitor.cancel = null; - } -} diff --git a/Tess4J/test/net/sourceforge/tess4j/TessAPI1Test.java b/Tess4J/test/net/sourceforge/tess4j/TessAPI1Test.java deleted file mode 100644 index 43a05fe..0000000 --- a/Tess4J/test/net/sourceforge/tess4j/TessAPI1Test.java +++ /dev/null @@ -1,645 +0,0 @@ -/** - * Copyright @ 2012 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j; - -import static org.junit.Assert.assertArrayEquals; - -import java.awt.image.BufferedImage; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileReader; -import java.nio.ByteBuffer; -import java.nio.FloatBuffer; -import java.nio.IntBuffer; -import java.util.Arrays; - -import javax.imageio.ImageIO; - -import net.sourceforge.tess4j.util.LoggHelper; -import net.sourceforge.tess4j.util.Utils; -import net.sourceforge.tess4j.util.ImageIOHelper; - -import com.ochafik.lang.jnaerator.runtime.NativeSize; -import com.sun.jna.NativeLong; - -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; - -import com.sun.jna.Pointer; -import com.sun.jna.StringArray; -import com.sun.jna.ptr.PointerByReference; -import net.sourceforge.lept4j.Box; -import net.sourceforge.lept4j.Boxa; -import static net.sourceforge.lept4j.ILeptonica.L_CLONE; -import net.sourceforge.lept4j.Leptonica; -import net.sourceforge.lept4j.Leptonica1; -import net.sourceforge.lept4j.Pix; -import net.sourceforge.lept4j.util.LeptUtils; - -import net.sourceforge.tess4j.ITessAPI.*; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import static net.sourceforge.tess4j.ITessAPI.FALSE; -import static net.sourceforge.tess4j.ITessAPI.TRUE; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -public class TessAPI1Test { - - private static final Logger logger = LoggerFactory.getLogger(new LoggHelper().toString()); - private final String datapath = "."; - private final String testResourcesDataPath = "test/resources/test-data"; - String language = "eng"; - String expOCRResult = "The (quick) [brown] {fox} jumps!\nOver the $43,456.78 #90 dog"; - - TessBaseAPI handle; - - @BeforeClass - public static void setUpClass() throws Exception { - } - - @AfterClass - public static void tearDownClass() throws Exception { - } - - @Before - public void setUp() { - handle = TessAPI1.TessBaseAPICreate(); - } - - @After - public void tearDown() { - TessAPI1.TessBaseAPIDelete(handle); - } - - /** - * Test of TessBaseAPIRect method, of class TessAPI1. - * - * @throws java.lang.Exception - */ - @Test - public void testTessBaseAPIRect() throws Exception { - logger.info("TessBaseAPIRect"); - String expResult = expOCRResult; - File tiff = new File(testResourcesDataPath, "eurotext.tif"); - BufferedImage image = ImageIO.read(tiff); // require jai-imageio lib to read TIFF - ByteBuffer buf = ImageIOHelper.convertImageData(image); - int bpp = image.getColorModel().getPixelSize(); - int bytespp = bpp / 8; - int bytespl = (int) Math.ceil(image.getWidth() * bpp / 8.0); - TessAPI1.TessBaseAPIInit3(handle, datapath, language); - TessAPI1.TessBaseAPISetPageSegMode(handle, TessPageSegMode.PSM_AUTO); - Pointer utf8Text = TessAPI1.TessBaseAPIRect(handle, buf, bytespp, bytespl, 0, 0, image.getWidth(), image.getHeight()); - String result = utf8Text.getString(0); - TessAPI1.TessDeleteText(utf8Text); - logger.info(result); - assertEquals(expResult, result.substring(0, expResult.length())); - } - - /** - * Test of TessBaseAPIGetUTF8Text method, of class TessAPI1. - * - * @throws java.lang.Exception - */ - @Test - public void testTessBaseAPIGetUTF8Text() throws Exception { - logger.info("TessBaseAPIGetUTF8Text"); - String expResult = expOCRResult; - File tiff = new File(testResourcesDataPath, "eurotext.tif"); - BufferedImage image = ImageIO.read(new FileInputStream(tiff)); // require jai-imageio lib to read TIFF - ByteBuffer buf = ImageIOHelper.convertImageData(image); - int bpp = image.getColorModel().getPixelSize(); - int bytespp = bpp / 8; - int bytespl = (int) Math.ceil(image.getWidth() * bpp / 8.0); - TessAPI1.TessBaseAPIInit3(handle, datapath, language); - TessAPI1.TessBaseAPISetPageSegMode(handle, TessPageSegMode.PSM_AUTO); - TessAPI1.TessBaseAPISetImage(handle, buf, image.getWidth(), image.getHeight(), bytespp, bytespl); - TessAPI1.TessBaseAPISetRectangle(handle, 0, 0, 1024, 800); - Pointer utf8Text = TessAPI1.TessBaseAPIGetUTF8Text(handle); - String result = utf8Text.getString(0); - TessAPI1.TessDeleteText(utf8Text); - logger.info(result); - assertEquals(expResult, result.substring(0, expResult.length())); - } - - /** - * Test of TessBaseAPIGetUTF8Text method, of class TessAPI1. - * - * @throws java.lang.Exception - */ - @Test - public void testTessBaseAPIGetUTF8Text_Pix() throws Exception { - logger.info("TessBaseAPIGetUTF8Text_Pix"); - String expResult = expOCRResult; - File tiff = new File(testResourcesDataPath, "eurotext.tif"); - Leptonica leptInstance = Leptonica.INSTANCE; - Pix pix = leptInstance.pixRead(tiff.getPath()); - TessAPI1.TessBaseAPIInit3(handle, datapath, language); - TessAPI1.TessBaseAPISetImage2(handle, pix); - Pointer utf8Text = TessAPI1.TessBaseAPIGetUTF8Text(handle); - String result = utf8Text.getString(0); - TessAPI1.TessDeleteText(utf8Text); - logger.info(result); - - //release Pix resource - PointerByReference pRef = new PointerByReference(); - pRef.setValue(pix.getPointer()); - leptInstance.pixDestroy(pRef); - - assertEquals(expResult, result.substring(0, expResult.length())); - } - - /** - * Test of TessBaseAPIGetComponentImages method, of class TessAPI1. - * - * @throws java.lang.Exception - */ - @Test - public void testTessBaseAPIGetComponentImages() throws Exception { - logger.info("TessBaseAPIGetComponentImages"); - File image = new File(testResourcesDataPath, "eurotext.png"); - int expResult = 12; // number of lines in the test image - Pix pix = Leptonica1.pixRead(image.getPath()); - TessAPI1.TessBaseAPIInit3(handle, datapath, language); - TessAPI1.TessBaseAPISetImage2(handle, pix); - PointerByReference pixa = null; - PointerByReference blockids = null; - Boxa boxes = TessAPI1.TessBaseAPIGetComponentImages(handle, TessPageIteratorLevel.RIL_TEXTLINE, TRUE, pixa, blockids); -// boxes = TessAPI1.TessBaseAPIGetRegions(handle, pixa); // equivalent to TessPageIteratorLevel.RIL_BLOCK - int boxCount = Leptonica1.boxaGetCount(boxes); - for (int i = 0; i < boxCount; i++) { - Box box = Leptonica1.boxaGetBox(boxes, i, L_CLONE); - if (box == null) { - continue; - } - TessAPI1.TessBaseAPISetRectangle(handle, box.x, box.y, box.w, box.h); - Pointer utf8Text = TessAPI1.TessBaseAPIGetUTF8Text(handle); - String ocrResult = utf8Text.getString(0); - TessAPI1.TessDeleteText(utf8Text); - int conf = TessAPI1.TessBaseAPIMeanTextConf(handle); - System.out.print(String.format("Box[%d]: x=%d, y=%d, w=%d, h=%d, confidence: %d, text: %s", i, box.x, box.y, box.w, box.h, conf, ocrResult)); - LeptUtils.dispose(box); - } - - // release Pix and Boxa resources - LeptUtils.dispose(pix); - LeptUtils.dispose(boxes); - - assertEquals(expResult, boxCount); - } - - /** - * Test of TessVersion method, of class TessAPI1. - */ - @Test - public void testTessVersion() { - logger.info("TessVersion"); - String expResult = "3.05.01"; - String result = TessAPI1.TessVersion(); - logger.info(result); - assertTrue(result.startsWith(expResult)); - } - - /** - * Test of TessBaseAPIGetBoolVariable method, of class TessAPI1. - */ - @Test - public void testTessBaseAPIGetBoolVariable() { - logger.info("TessBaseAPIGetBoolVariable"); - String name = "tessedit_create_hocr"; - TessAPI1.TessBaseAPISetVariable(handle, name, "1"); - IntBuffer value = IntBuffer.allocate(1); - int result = -1; - if (TessAPI1.TessBaseAPIGetBoolVariable(handle, "tessedit_create_hocr", value) == TRUE) { - result = value.get(0); - } - int expResult = 1; - assertEquals(expResult, result); - } - - /** - * Test of TessBaseAPIPrintVariables method, of class TessAPI1. - * - * @throws java.lang.Exception - */ - @Test - public void testTessBaseAPIPrintVariablesToFile() throws Exception { - logger.info("TessBaseAPIPrintVariablesToFile"); - String var = "tessedit_char_whitelist"; - String value = "0123456789"; - TessAPI1.TessBaseAPISetVariable(handle, var, value); - String filename = "printvar.txt"; - TessAPI1.TessBaseAPIPrintVariablesToFile(handle, filename); // will crash if not invoked after some method - File file = new File(filename); - BufferedReader input = new BufferedReader(new FileReader(file)); - StringBuilder strB = new StringBuilder(); - String line; - String EOL = System.getProperty("line.separator"); - while ((line = input.readLine()) != null) { - strB.append(line).append(EOL); - } - input.close(); - file.delete(); - assertTrue(strB.toString().contains(var + "\t" + value)); - } - - /** - * Test of TessBaseAPIInit4 method, of class TessAPI1. - */ - @Test - public void testTessBaseAPIInit4() { - logger.info("TessBaseAPIInit4"); - int oem = TessOcrEngineMode.OEM_DEFAULT; - PointerByReference configs = null; - int configs_size = 0; - - // disable loading dictionaries - String[] args = new String[]{"load_system_dawg", "load_freq_dawg"}; - StringArray sarray = new StringArray(args); - PointerByReference vars_vec = new PointerByReference(); - vars_vec.setPointer(sarray); - - args = new String[]{"F", "F"}; - sarray = new StringArray(args); - PointerByReference vars_values = new PointerByReference(); - vars_values.setPointer(sarray); - - NativeSize vars_vec_size = new NativeSize(args.length); - - int expResult = 0; - int result = TessAPI1.TessBaseAPIInit4(handle, datapath, language, oem, configs, configs_size, vars_vec, vars_values, vars_vec_size, FALSE); - assertEquals(expResult, result); - } - - /** - * Test of TessBaseAPIGetInitLanguagesAsString method, of class TessAPI1. - */ - @Test - public void testTessBaseAPIGetInitLanguagesAsString() { - logger.info("TessBaseAPIGetInitLanguagesAsString"); - String expResult = ""; - String result = TessAPI1.TessBaseAPIGetInitLanguagesAsString(handle); - assertEquals(expResult, result); - } - - /** - * Test of TessBaseAPIGetLoadedLanguagesAsVector method, of class TessAPI1. - */ - @Test - public void testTessBaseAPIGetLoadedLanguagesAsVector() { - logger.info("TessBaseAPIGetLoadedLanguagesAsVector"); - TessAPI1.TessBaseAPIInit3(handle, datapath, language); - String[] expResult = {"eng"}; - String[] result = TessAPI1.TessBaseAPIGetLoadedLanguagesAsVector(handle).getPointer().getStringArray(0); - assertArrayEquals(expResult, result); - } - - /** - * Test of TessBaseAPIGetAvailableLanguagesAsVector method, of class - * TessAPI1. - */ - @Test - public void testTessBaseAPIGetAvailableLanguagesAsVector() { - logger.info("TessBaseAPIGetAvailableLanguagesAsVector"); - TessAPI1.TessBaseAPIInit3(handle, datapath, language); - String[] expResult = {"eng"}; - String[] result = TessAPI1.TessBaseAPIGetAvailableLanguagesAsVector(handle).getPointer().getStringArray(0); - assertTrue(Arrays.asList(result).containsAll(Arrays.asList(expResult))); - } - - /** - * Test of TessBaseAPIGetHOCRText method, of class TessAPI1. - * - * @throws java.lang.Exception - */ - @Test - public void testTessBaseAPIGetHOCRText() throws Exception { - logger.info("TessBaseAPIGetHOCRText"); - File tiff = new File(testResourcesDataPath, "eurotext.tif"); - BufferedImage image = ImageIO.read(new FileInputStream(tiff)); // require jai-imageio lib to read TIFF - ByteBuffer buf = ImageIOHelper.convertImageData(image); - int bpp = image.getColorModel().getPixelSize(); - int bytespp = bpp / 8; - int bytespl = (int) Math.ceil(image.getWidth() * bpp / 8.0); - TessAPI1.TessBaseAPISetPageSegMode(handle, TessPageSegMode.PSM_AUTO); - TessAPI1.TessBaseAPIInit3(handle, datapath, language); - TessAPI1.TessBaseAPISetImage(handle, buf, image.getWidth(), image.getHeight(), bytespp, bytespl); - int page_number = 0; - Pointer utf8Text = TessAPI1.TessBaseAPIGetHOCRText(handle, page_number); - String result = utf8Text.getString(0); - TessAPI1.TessDeleteText(utf8Text); - assertTrue(result.contains("
0 causes blank ouput - monitor.end_time = timeout; - ProgressMonitor pmo = new ProgressMonitor(monitor); - pmo.start(); - TessAPI1.TessBaseAPIRecognize(handle, monitor); - logger.info("Message: " + pmo.getMessage()); - TessResultIterator ri = TessAPI1.TessBaseAPIGetIterator(handle); - TessPageIterator pi = TessAPI1.TessResultIteratorGetPageIterator(ri); - TessAPI1.TessPageIteratorBegin(pi); - logger.info("Bounding boxes:\nchar(s) left top right bottom confidence font-attributes"); - int level = TessPageIteratorLevel.RIL_WORD; - - // int height = image.getHeight(); - do { - Pointer ptr = TessAPI1.TessResultIteratorGetUTF8Text(ri, level); - String word = ptr.getString(0); - TessAPI1.TessDeleteText(ptr); - float confidence = TessAPI1.TessResultIteratorConfidence(ri, level); - IntBuffer leftB = IntBuffer.allocate(1); - IntBuffer topB = IntBuffer.allocate(1); - IntBuffer rightB = IntBuffer.allocate(1); - IntBuffer bottomB = IntBuffer.allocate(1); - TessAPI1.TessPageIteratorBoundingBox(pi, level, leftB, topB, rightB, bottomB); - int left = leftB.get(); - int top = topB.get(); - int right = rightB.get(); - int bottom = bottomB.get(); - System.out.print(String.format("%s %d %d %d %d %f", word, left, top, right, bottom, confidence)); - // logger.info(String.format("%s %d %d %d %d", str, left, height - bottom, right, height - top)); // - // training box coordinates - - IntBuffer boldB = IntBuffer.allocate(1); - IntBuffer italicB = IntBuffer.allocate(1); - IntBuffer underlinedB = IntBuffer.allocate(1); - IntBuffer monospaceB = IntBuffer.allocate(1); - IntBuffer serifB = IntBuffer.allocate(1); - IntBuffer smallcapsB = IntBuffer.allocate(1); - IntBuffer pointSizeB = IntBuffer.allocate(1); - IntBuffer fontIdB = IntBuffer.allocate(1); - String fontName = TessAPI1.TessResultIteratorWordFontAttributes(ri, boldB, italicB, underlinedB, - monospaceB, serifB, smallcapsB, pointSizeB, fontIdB); - boolean bold = boldB.get() == TRUE; - boolean italic = italicB.get() == TRUE; - boolean underlined = underlinedB.get() == TRUE; - boolean monospace = monospaceB.get() == TRUE; - boolean serif = serifB.get() == TRUE; - boolean smallcaps = smallcapsB.get() == TRUE; - int pointSize = pointSizeB.get(); - int fontId = fontIdB.get(); - logger.info(String.format(" font: %s, size: %d, font id: %d, bold: %b," - + " italic: %b, underlined: %b, monospace: %b, serif: %b, smallcap: %b", fontName, pointSize, - fontId, bold, italic, underlined, monospace, serif, smallcaps)); - } while (TessAPI1.TessPageIteratorNext(pi, level) == TRUE); - - assertTrue(true); - } - - /** - * Test of ChoiceIterator. - * - * @throws Exception - */ - @Test - public void testChoiceIterator() throws Exception { - logger.info("TessResultIteratorGetChoiceIterator"); - File tiff = new File(testResourcesDataPath, "eurotext.tif"); - BufferedImage image = ImageIO.read(new FileInputStream(tiff)); // require jai-imageio lib to read TIFF - ByteBuffer buf = ImageIOHelper.convertImageData(image); - int bpp = image.getColorModel().getPixelSize(); - int bytespp = bpp / 8; - int bytespl = (int) Math.ceil(image.getWidth() * bpp / 8.0); - TessAPI1.TessBaseAPIInit3(handle, datapath, language); - TessAPI1.TessBaseAPISetImage(handle, buf, image.getWidth(), image.getHeight(), bytespp, bytespl); - TessAPI1.TessBaseAPISetVariable(handle, "save_blob_choices", "T"); - TessAPI1.TessBaseAPISetRectangle(handle, 37, 228, 548, 31); - ETEXT_DESC monitor = new ETEXT_DESC(); - ProgressMonitor pmo = new ProgressMonitor(monitor); - pmo.start(); - TessAPI1.TessBaseAPIRecognize(handle, monitor); - logger.info("Message: " + pmo.getMessage()); - TessResultIterator ri = TessAPI1.TessBaseAPIGetIterator(handle); - int level = TessPageIteratorLevel.RIL_SYMBOL; - - if (ri != null) { - do { - Pointer symbol = TessAPI1.TessResultIteratorGetUTF8Text(ri, level); - float conf = TessAPI1.TessResultIteratorConfidence(ri, level); - if (symbol != null) { - logger.info(String.format("symbol %s, conf: %f", symbol.getString(0), conf)); - boolean indent = false; - TessChoiceIterator ci = TessAPI1.TessResultIteratorGetChoiceIterator(ri); - do { - if (indent) { - System.out.print("\t"); - } - System.out.print("\t- "); - String choice = TessAPI1.TessChoiceIteratorGetUTF8Text(ci); - logger.info(String.format("%s conf: %f", choice, TessAPI1.TessChoiceIteratorConfidence(ci))); - indent = true; - } while (TessAPI1.TessChoiceIteratorNext(ci) == ITessAPI.TRUE); - TessAPI1.TessChoiceIteratorDelete(ci); - } - logger.info("---------------------------------------------"); - TessAPI1.TessDeleteText(symbol); - } while (TessAPI1.TessResultIteratorNext(ri, level) == ITessAPI.TRUE); - } - - assertTrue(true); - } - - /** - * Test of ResultRenderer method, of class TessAPI1. - * - * @throws java.lang.Exception - */ - @Test - public void testResultRenderer() throws Exception { - logger.info("TessResultRenderer"); - String image = String.format("%s/%s", testResourcesDataPath, "eurotext.tif"); - String output = "capi-test.txt"; - int set_only_init_params = ITessAPI.FALSE; - int oem = TessOcrEngineMode.OEM_DEFAULT; - PointerByReference configs = null; - int configs_size = 0; - - String[] params = {"load_system_dawg", "tessedit_char_whitelist"}; - String vals[] = {"F", ""}; //0123456789-.IThisalotfpnex - PointerByReference vars_vec = new PointerByReference(); - vars_vec.setPointer(new StringArray(params)); - PointerByReference vars_values = new PointerByReference(); - vars_values.setPointer(new StringArray(vals)); - NativeSize vars_vec_size = new NativeSize(params.length); - - TessAPI1.TessBaseAPISetOutputName(handle, output); - - int rc = TessAPI1.TessBaseAPIInit4(handle, datapath, language, - oem, configs, configs_size, vars_vec, vars_values, vars_vec_size, set_only_init_params); - - if (rc != 0) { - TessAPI1.TessBaseAPIDelete(handle); - logger.error("Could not initialize tesseract."); - return; - } - - String outputbase = "test/test-results/outputbase1"; - TessResultRenderer renderer = TessAPI1.TessHOcrRendererCreate(outputbase); - TessAPI1.TessResultRendererInsert(renderer, TessAPI1.TessBoxTextRendererCreate(outputbase)); - TessAPI1.TessResultRendererInsert(renderer, TessAPI1.TessTextRendererCreate(outputbase)); - String dataPath = TessAPI1.TessBaseAPIGetDatapath(handle); - TessAPI1.TessResultRendererInsert(renderer, TessAPI1.TessPDFRendererCreate(outputbase, dataPath)); - int result = TessAPI1.TessBaseAPIProcessPages(handle, image, null, 0, renderer); - -// if (result == FALSE) { -// logger.error("Error during processing."); -// return; -// } - for (; renderer != null; renderer = TessAPI1.TessResultRendererNext(renderer)) { - String ext = TessAPI1.TessResultRendererExtention(renderer).getString(0); - logger.info(String.format("TessResultRendererExtention: %s\nTessResultRendererTitle: %s\nTessResultRendererImageNum: %d", - ext, - TessAPI1.TessResultRendererTitle(renderer).getString(0), - TessAPI1.TessResultRendererImageNum(renderer))); - } - - TessAPI1.TessDeleteResultRenderer(renderer); - assertTrue(new File(outputbase + ".pdf").exists()); - } -} diff --git a/Tess4J/test/net/sourceforge/tess4j/TessAPIImpl.java b/Tess4J/test/net/sourceforge/tess4j/TessAPIImpl.java deleted file mode 100644 index 3836e8c..0000000 --- a/Tess4J/test/net/sourceforge/tess4j/TessAPIImpl.java +++ /dev/null @@ -1,625 +0,0 @@ -/* - * Copyright @ 2017 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package net.sourceforge.tess4j; - -import com.ochafik.lang.jnaerator.runtime.NativeSize; -import com.sun.jna.Pointer; -import com.sun.jna.ptr.IntByReference; -import com.sun.jna.ptr.PointerByReference; -import java.nio.ByteBuffer; -import java.nio.DoubleBuffer; -import java.nio.FloatBuffer; -import java.nio.IntBuffer; -import net.sourceforge.lept4j.Boxa; -import net.sourceforge.lept4j.Pix; - -public class TessAPIImpl implements TessAPI { - - public TessAPI getInstance() { - return TessAPI.INSTANCE; - } - - public void TessAPIEndPage() { - } - - public void TessAPIRelease() { - } - - @Override - public String TessVersion() { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessDeleteText(Pointer text) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessDeleteTextArray(PointerByReference arr) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessDeleteIntArray(IntBuffer arr) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public ITessAPI.TessResultRenderer TessTextRendererCreate(String outputbase) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public ITessAPI.TessResultRenderer TessHOcrRendererCreate(String outputbase) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public TessResultRenderer TessHOcrRendererCreate2(String outputbase, int font_info) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public ITessAPI.TessResultRenderer TessPDFRendererCreate(String outputbase, String datadir) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public TessResultRenderer TessPDFRendererCreateTextonly(String outputbase, String datadir, int textonly) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public ITessAPI.TessResultRenderer TessUnlvRendererCreate(String outputbase) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public ITessAPI.TessResultRenderer TessBoxTextRendererCreate(String outputbase) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessDeleteResultRenderer(ITessAPI.TessResultRenderer renderer) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessResultRendererInsert(ITessAPI.TessResultRenderer renderer, ITessAPI.TessResultRenderer next) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public ITessAPI.TessResultRenderer TessResultRendererNext(ITessAPI.TessResultRenderer renderer) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessResultRendererBeginDocument(ITessAPI.TessResultRenderer renderer, String title) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessResultRendererAddImage(ITessAPI.TessResultRenderer renderer, PointerByReference api) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessResultRendererEndDocument(ITessAPI.TessResultRenderer renderer) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Pointer TessResultRendererExtention(ITessAPI.TessResultRenderer renderer) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Pointer TessResultRendererTitle(ITessAPI.TessResultRenderer renderer) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessResultRendererImageNum(ITessAPI.TessResultRenderer renderer) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public ITessAPI.TessBaseAPI TessBaseAPICreate() { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPIDelete(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPISetInputName(ITessAPI.TessBaseAPI handle, String name) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public String TessBaseAPIGetInputName(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPISetInputImage(ITessAPI.TessBaseAPI handle, Pix pix) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Pix TessBaseAPIGetInputImage(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIGetSourceYResolution(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public String TessBaseAPIGetDatapath(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPISetOutputName(ITessAPI.TessBaseAPI handle, String name) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPISetVariable(ITessAPI.TessBaseAPI handle, String name, String value) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIGetIntVariable(ITessAPI.TessBaseAPI handle, String name, IntBuffer value) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIGetBoolVariable(ITessAPI.TessBaseAPI handle, String name, IntBuffer value) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIGetDoubleVariable(ITessAPI.TessBaseAPI handle, String name, DoubleBuffer value) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public String TessBaseAPIGetStringVariable(ITessAPI.TessBaseAPI handle, String name) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPIPrintVariablesToFile(ITessAPI.TessBaseAPI handle, String filename) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIInit1(ITessAPI.TessBaseAPI handle, String datapath, String language, int oem, PointerByReference configs, int configs_size) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIInit2(ITessAPI.TessBaseAPI handle, String datapath, String language, int oem) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIInit3(ITessAPI.TessBaseAPI handle, String datapath, String language) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIInit4(ITessAPI.TessBaseAPI handle, String datapath, String language, int oem, PointerByReference configs, int configs_size, PointerByReference vars_vec, PointerByReference vars_values, NativeSize vars_vec_size, int set_only_non_debug_params) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public String TessBaseAPIGetInitLanguagesAsString(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public PointerByReference TessBaseAPIGetLoadedLanguagesAsVector(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public PointerByReference TessBaseAPIGetAvailableLanguagesAsVector(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIInitLangMod(ITessAPI.TessBaseAPI handle, String datapath, String language) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPIInitForAnalysePage(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPIReadConfigFile(ITessAPI.TessBaseAPI handle, String filename, int init_only) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPISetPageSegMode(ITessAPI.TessBaseAPI handle, int mode) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIGetPageSegMode(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Pointer TessBaseAPIRect(ITessAPI.TessBaseAPI handle, ByteBuffer imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPIClearAdaptiveClassifier(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPISetImage(ITessAPI.TessBaseAPI handle, ByteBuffer imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPISetImage2(ITessAPI.TessBaseAPI handle, Pix pix) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPISetSourceResolution(ITessAPI.TessBaseAPI handle, int ppi) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPISetRectangle(ITessAPI.TessBaseAPI handle, int left, int top, int width, int height) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Pix TessBaseAPIGetThresholdedImage(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Boxa TessBaseAPIGetRegions(ITessAPI.TessBaseAPI handle, PointerByReference pixa) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Boxa TessBaseAPIGetTextlines(ITessAPI.TessBaseAPI handle, PointerByReference pixa, PointerByReference blockids) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Boxa TessBaseAPIGetTextlines1(ITessAPI.TessBaseAPI handle, int raw_image, int raw_padding, PointerByReference pixa, PointerByReference blockids, PointerByReference paraids) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Boxa TessBaseAPIGetStrips(ITessAPI.TessBaseAPI handle, PointerByReference pixa, PointerByReference blockids) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Boxa TessBaseAPIGetWords(ITessAPI.TessBaseAPI handle, PointerByReference pixa) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Boxa TessBaseAPIGetConnectedComponents(ITessAPI.TessBaseAPI handle, PointerByReference cc) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Boxa TessBaseAPIGetComponentImages(ITessAPI.TessBaseAPI handle, int level, int text_only, PointerByReference pixa, PointerByReference blockids) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Boxa TessBaseAPIGetComponentImages1(ITessAPI.TessBaseAPI handle, int level, int text_only, int raw_image, int raw_padding, PointerByReference pixa, PointerByReference blockids, PointerByReference paraids) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIGetThresholdedImageScaleFactor(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPIDumpPGM(ITessAPI.TessBaseAPI handle, String filename) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public ITessAPI.TessPageIterator TessBaseAPIAnalyseLayout(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIRecognize(ITessAPI.TessBaseAPI handle, ITessAPI.ETEXT_DESC monitor) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIRecognizeForChopTest(ITessAPI.TessBaseAPI handle, ITessAPI.ETEXT_DESC monitor) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public ITessAPI.TessResultIterator TessBaseAPIGetIterator(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public ITessAPI.TessMutableIterator TessBaseAPIGetMutableIterator(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIProcessPages(ITessAPI.TessBaseAPI handle, String filename, String retry_config, int timeout_millisec, ITessAPI.TessResultRenderer renderer) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIProcessPage(ITessAPI.TessBaseAPI handle, Pix pix, int page_index, String filename, String retry_config, int timeout_millisec, ITessAPI.TessResultRenderer renderer) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Pointer TessBaseAPIGetUTF8Text(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Pointer TessBaseAPIGetHOCRText(ITessAPI.TessBaseAPI handle, int page_number) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Pointer TessBaseAPIGetBoxText(ITessAPI.TessBaseAPI handle, int page_number) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Pointer TessBaseAPIGetUNLVText(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIMeanTextConf(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public IntByReference TessBaseAPIAllWordConfidences(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIAdaptToWordStr(ITessAPI.TessBaseAPI handle, int mode, String wordstr) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPIClear(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPIEnd(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIIsValidWord(ITessAPI.TessBaseAPI handle, String word) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIGetTextDirection(ITessAPI.TessBaseAPI handle, IntBuffer out_offset, FloatBuffer out_slope) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessBaseAPIClearPersistentCache(ITessAPI.TessBaseAPI handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessBaseAPIDetectOrientationScript(TessBaseAPI handle, IntBuffer orient_deg, FloatBuffer orient_conf, PointerByReference script_name, FloatBuffer script_conf) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public String TessBaseAPIGetUnichar(ITessAPI.TessBaseAPI handle, int unichar_id) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessPageIteratorDelete(ITessAPI.TessPageIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public ITessAPI.TessPageIterator TessPageIteratorCopy(ITessAPI.TessPageIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessPageIteratorBegin(ITessAPI.TessPageIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessPageIteratorNext(ITessAPI.TessPageIterator handle, int level) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessPageIteratorIsAtBeginningOf(ITessAPI.TessPageIterator handle, int level) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessPageIteratorIsAtFinalElement(ITessAPI.TessPageIterator handle, int level, int element) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessPageIteratorBoundingBox(ITessAPI.TessPageIterator handle, int level, IntBuffer left, IntBuffer top, IntBuffer right, IntBuffer bottom) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessPageIteratorBlockType(ITessAPI.TessPageIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Pix TessPageIteratorGetBinaryImage(ITessAPI.TessPageIterator handle, int level) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Pix TessPageIteratorGetImage(ITessAPI.TessPageIterator handle, int level, int padding, Pix original_image, IntBuffer left, IntBuffer top) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessPageIteratorBaseline(ITessAPI.TessPageIterator handle, int level, IntBuffer x1, IntBuffer y1, IntBuffer x2, IntBuffer y2) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessPageIteratorOrientation(ITessAPI.TessPageIterator handle, IntBuffer orientation, IntBuffer writing_direction, IntBuffer textline_order, FloatBuffer deskew_angle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessPageIteratorParagraphInfo(ITessAPI.TessPageIterator handle, IntBuffer justification, IntBuffer is_list_item, IntBuffer is_crown, IntBuffer first_line_indent) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessResultIteratorDelete(ITessAPI.TessResultIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public ITessAPI.TessResultIterator TessResultIteratorCopy(ITessAPI.TessResultIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public ITessAPI.TessPageIterator TessResultIteratorGetPageIterator(ITessAPI.TessResultIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public ITessAPI.TessPageIterator TessResultIteratorGetPageIteratorConst(ITessAPI.TessResultIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessResultIteratorNext(ITessAPI.TessResultIterator handle, int level) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public Pointer TessResultIteratorGetUTF8Text(ITessAPI.TessResultIterator handle, int level) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public float TessResultIteratorConfidence(ITessAPI.TessResultIterator handle, int level) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public String TessResultIteratorWordRecognitionLanguage(ITessAPI.TessResultIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public String TessResultIteratorWordFontAttributes(ITessAPI.TessResultIterator handle, IntBuffer is_bold, IntBuffer is_italic, IntBuffer is_underlined, IntBuffer is_monospace, IntBuffer is_serif, IntBuffer is_smallcaps, IntBuffer pointsize, IntBuffer font_id) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessResultIteratorWordIsFromDictionary(ITessAPI.TessResultIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessResultIteratorWordIsNumeric(ITessAPI.TessResultIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessResultIteratorSymbolIsSuperscript(ITessAPI.TessResultIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessResultIteratorSymbolIsSubscript(ITessAPI.TessResultIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessResultIteratorSymbolIsDropcap(ITessAPI.TessResultIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public ITessAPI.TessChoiceIterator TessResultIteratorGetChoiceIterator(ITessAPI.TessResultIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public void TessChoiceIteratorDelete(ITessAPI.TessChoiceIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public int TessChoiceIteratorNext(ITessAPI.TessChoiceIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public String TessChoiceIteratorGetUTF8Text(ITessAPI.TessChoiceIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public float TessChoiceIteratorConfidence(ITessAPI.TessChoiceIterator handle) { - throw new UnsupportedOperationException("Not supported yet."); - } -} diff --git a/Tess4J/test/net/sourceforge/tess4j/TessAPITest.java b/Tess4J/test/net/sourceforge/tess4j/TessAPITest.java deleted file mode 100644 index c6b7dfb..0000000 --- a/Tess4J/test/net/sourceforge/tess4j/TessAPITest.java +++ /dev/null @@ -1,648 +0,0 @@ -/** - * Copyright @ 2012 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j; - -import static org.junit.Assert.assertArrayEquals; - -import java.awt.image.BufferedImage; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileReader; -import java.nio.ByteBuffer; -import java.nio.FloatBuffer; -import java.nio.IntBuffer; -import java.util.Arrays; - -import javax.imageio.ImageIO; - -import net.sourceforge.tess4j.util.ImageIOHelper; -import net.sourceforge.tess4j.util.LoggHelper; -import net.sourceforge.tess4j.util.Utils; - -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; - -import com.ochafik.lang.jnaerator.runtime.NativeSize; -import com.sun.jna.NativeLong; -import com.sun.jna.Pointer; -import com.sun.jna.StringArray; -import com.sun.jna.ptr.PointerByReference; -import net.sourceforge.lept4j.Box; -import net.sourceforge.lept4j.Boxa; -import static net.sourceforge.lept4j.ILeptonica.L_CLONE; -import net.sourceforge.lept4j.Leptonica; -import net.sourceforge.lept4j.Pix; -import net.sourceforge.lept4j.util.LeptUtils; - -import net.sourceforge.tess4j.ITessAPI.*; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import static net.sourceforge.tess4j.ITessAPI.FALSE; -import static net.sourceforge.tess4j.ITessAPI.TRUE; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -public class TessAPITest { - - private static final Logger logger = LoggerFactory.getLogger(new LoggHelper().toString()); - private final String datapath = "."; - private final String testResourcesDataPath = "test/resources/test-data"; - String language = "eng"; - String expOCRResult = "The (quick) [brown] {fox} jumps!\nOver the $43,456.78 #90 dog"; - - TessAPI api; - TessBaseAPI handle; - - @BeforeClass - public static void setUpClass() throws Exception { - } - - @AfterClass - public static void tearDownClass() throws Exception { - } - - @Before - public void setUp() { - api = new TessAPIImpl().getInstance(); - handle = api.TessBaseAPICreate(); - } - - @After - public void tearDown() { - api.TessBaseAPIDelete(handle); - } - - /** - * Test of TessBaseAPIRect method, of class TessAPI. - * - * @throws java.lang.Exception - */ - @Test - public void testTessBaseAPIRect() throws Exception { - logger.info("TessBaseAPIRect"); - String expResult = expOCRResult; - File tiff = new File(testResourcesDataPath, "eurotext.tif"); - BufferedImage image = ImageIO.read(tiff); // require jai-imageio lib to read TIFF - ByteBuffer buf = ImageIOHelper.convertImageData(image); - int bpp = image.getColorModel().getPixelSize(); - int bytespp = bpp / 8; - int bytespl = (int) Math.ceil(image.getWidth() * bpp / 8.0); - api.TessBaseAPIInit3(handle, datapath, language); - api.TessBaseAPISetPageSegMode(handle, TessPageSegMode.PSM_AUTO); - Pointer utf8Text = api.TessBaseAPIRect(handle, buf, bytespp, bytespl, 90, 50, 862, 614); - String result = utf8Text.getString(0); - api.TessDeleteText(utf8Text); - logger.info(result); - assertTrue(result.startsWith(expResult)); - } - - /** - * Test of TessBaseAPIGetUTF8Text method, of class TessAPI. - * - * @throws java.lang.Exception - */ - @Test - public void testTessBaseAPIGetUTF8Text() throws Exception { - logger.info("TessBaseAPIGetUTF8Text"); - String expResult = expOCRResult; - File tiff = new File(testResourcesDataPath, "eurotext.tif"); - BufferedImage image = ImageIO.read(new FileInputStream(tiff)); // require jai-imageio lib to read TIFF - ByteBuffer buf = ImageIOHelper.convertImageData(image); - int bpp = image.getColorModel().getPixelSize(); - int bytespp = bpp / 8; - int bytespl = (int) Math.ceil(image.getWidth() * bpp / 8.0); - api.TessBaseAPIInit3(handle, datapath, language); - api.TessBaseAPISetPageSegMode(handle, TessPageSegMode.PSM_AUTO); - api.TessBaseAPISetImage(handle, buf, image.getWidth(), image.getHeight(), bytespp, bytespl); - api.TessBaseAPISetRectangle(handle, 90, 50, 862, 614); - Pointer utf8Text = api.TessBaseAPIGetUTF8Text(handle); - String result = utf8Text.getString(0); - api.TessDeleteText(utf8Text); - logger.info(result); - assertTrue(result.startsWith(expResult)); - } - - /** - * Test of TessBaseAPIGetUTF8Text method, of class TessAPI. - * - * @throws java.lang.Exception - */ - @Test - public void testTessBaseAPIGetUTF8Text_Pix() throws Exception { - logger.info("TessBaseAPIGetUTF8Text_Pix"); - String expResult = expOCRResult; - File tiff = new File(testResourcesDataPath, "eurotext.tif"); - Leptonica leptInstance = Leptonica.INSTANCE; - Pix pix = leptInstance.pixRead(tiff.getPath()); - api.TessBaseAPIInit3(handle, datapath, language); - api.TessBaseAPISetImage2(handle, pix); - Pointer utf8Text = api.TessBaseAPIGetUTF8Text(handle); - String result = utf8Text.getString(0); - api.TessDeleteText(utf8Text); - logger.info(result); - - //release Pix resource - PointerByReference pRef = new PointerByReference(); - pRef.setValue(pix.getPointer()); - leptInstance.pixDestroy(pRef); - - assertTrue(result.startsWith(expResult)); - } - - /** - * Test of TessBaseAPIGetComponentImages method, of class TessAPI. - * - * @throws java.lang.Exception - */ - @Test - public void testTessBaseAPIGetComponentImages() throws Exception { - logger.info("TessBaseAPIGetComponentImages"); - File image = new File(testResourcesDataPath, "eurotext.png"); - int expResult = 12; // number of lines in the test image - Leptonica leptInstance = Leptonica.INSTANCE; - Pix pix = leptInstance.pixRead(image.getPath()); - api.TessBaseAPIInit3(handle, datapath, language); - api.TessBaseAPISetImage2(handle, pix); - PointerByReference pixa = null; - PointerByReference blockids = null; - Boxa boxes = api.TessBaseAPIGetComponentImages(handle, TessPageIteratorLevel.RIL_TEXTLINE, TRUE, pixa, blockids); -// boxes = api.TessBaseAPIGetRegions(handle, pixa); // equivalent to TessPageIteratorLevel.RIL_BLOCK - int boxCount = leptInstance.boxaGetCount(boxes); - for (int i = 0; i < boxCount; i++) { - Box box = leptInstance.boxaGetBox(boxes, i, L_CLONE); - if (box == null) { - continue; - } - api.TessBaseAPISetRectangle(handle, box.x, box.y, box.w, box.h); - Pointer utf8Text = api.TessBaseAPIGetUTF8Text(handle); - String ocrResult = utf8Text.getString(0); - api.TessDeleteText(utf8Text); - int conf = api.TessBaseAPIMeanTextConf(handle); - System.out.print(String.format("Box[%d]: x=%d, y=%d, w=%d, h=%d, confidence: %d, text: %s", i, box.x, box.y, box.w, box.h, conf, ocrResult)); - LeptUtils.dispose(box); - } - - // release Pix and Boxa resources - LeptUtils.dispose(pix); - LeptUtils.dispose(boxes); - - assertEquals(expResult, boxCount); - } - - /** - * Test of TessVersion method, of class TessAPI. - */ - @Test - public void testTessVersion() { - logger.info("TessVersion"); - String expResult = "3.05.01"; - String result = api.TessVersion(); - logger.info(result); - assertTrue(result.startsWith(expResult)); - } - - /** - * Test of TessBaseAPIGetBoolVariable method, of class TessAPI. - */ - @Test - public void testTessBaseAPIGetBoolVariable() { - logger.info("TessBaseAPIGetBoolVariable"); - String name = "tessedit_create_hocr"; - api.TessBaseAPISetVariable(handle, name, "1"); - IntBuffer value = IntBuffer.allocate(1); - int result = -1; - if (api.TessBaseAPIGetBoolVariable(handle, "tessedit_create_hocr", value) == TRUE) { - result = value.get(0); - } - int expResult = 1; - assertEquals(expResult, result); - } - - /** - * Test of TessBaseAPIPrintVariables method, of class TessAPI. - * - * @throws java.lang.Exception - */ - @Test - public void testTessBaseAPIPrintVariablesToFile() throws Exception { - logger.info("TessBaseAPIPrintVariablesToFile"); - String var = "tessedit_char_whitelist"; - String value = "0123456789"; - api.TessBaseAPISetVariable(handle, var, value); - String filename = "printvar.txt"; - api.TessBaseAPIPrintVariablesToFile(handle, filename); // will crash if not invoked after some method - File file = new File(filename); - BufferedReader input = new BufferedReader(new FileReader(file)); - StringBuilder strB = new StringBuilder(); - String line; - String EOL = System.getProperty("line.separator"); - while ((line = input.readLine()) != null) { - strB.append(line).append(EOL); - } - input.close(); - file.delete(); - assertTrue(strB.toString().contains(var + "\t" + value)); - } - - /** - * Test of TessBaseAPIInit4 method, of class TessAPI. - */ - @Test - public void testTessBaseAPIInit4() { - logger.info("TessBaseAPIInit4"); - int oem = TessOcrEngineMode.OEM_DEFAULT; - PointerByReference configs = null; //new PointerByReference(); - int configs_size = 0; - - // disable loading dictionaries - String[] args = new String[]{"load_system_dawg", "load_freq_dawg"}; - StringArray sarray = new StringArray(args); - PointerByReference vars_vec = new PointerByReference(); - vars_vec.setPointer(sarray); - - args = new String[]{"F", "F"}; - sarray = new StringArray(args); - PointerByReference vars_values = new PointerByReference(); - vars_values.setPointer(sarray); - - NativeSize vars_vec_size = new NativeSize(args.length); - - int expResult = 0; - int result = api.TessBaseAPIInit4(handle, datapath, language, oem, configs, configs_size, vars_vec, vars_values, vars_vec_size, FALSE); - assertEquals(expResult, result); - } - - /** - * Test of TessBaseAPIGetInitLanguagesAsString method, of class TessAPI. - */ - @Test - public void testTessBaseAPIGetInitLanguagesAsString() { - logger.info("TessBaseAPIGetInitLanguagesAsString"); - String expResult = ""; - String result = api.TessBaseAPIGetInitLanguagesAsString(handle); - assertEquals(expResult, result); - } - - /** - * Test of TessBaseAPIGetLoadedLanguagesAsVector method, of class TessAPI. - */ - @Test - public void testTessBaseAPIGetLoadedLanguagesAsVector() { - logger.info("TessBaseAPIGetLoadedLanguagesAsVector"); - api.TessBaseAPIInit3(handle, datapath, language); - String[] expResult = {"eng"}; - String[] result = api.TessBaseAPIGetLoadedLanguagesAsVector(handle).getPointer().getStringArray(0); - assertArrayEquals(expResult, result); - } - - /** - * Test of TessBaseAPIGetAvailableLanguagesAsVector method, of class - * TessAPI. - */ - @Test - public void testTessBaseAPIGetAvailableLanguagesAsVector() { - logger.info("TessBaseAPIGetAvailableLanguagesAsVector"); - api.TessBaseAPIInit3(handle, datapath, language); - String[] expResult = {"eng"}; - String[] result = api.TessBaseAPIGetAvailableLanguagesAsVector(handle).getPointer().getStringArray(0); - assertTrue(Arrays.asList(result).containsAll(Arrays.asList(expResult))); - } - - /** - * Test of TessBaseAPIGetHOCRText method, of class TessAPI. - * - * @throws java.lang.Exception - */ - @Test - public void testTessBaseAPIGetHOCRText() throws Exception { - logger.info("TessBaseAPIGetHOCRText"); - File tiff = new File(testResourcesDataPath, "eurotext.tif"); - BufferedImage image = ImageIO.read(new FileInputStream(tiff)); // require jai-imageio lib to read TIFF - ByteBuffer buf = ImageIOHelper.convertImageData(image); - int bpp = image.getColorModel().getPixelSize(); - int bytespp = bpp / 8; - int bytespl = (int) Math.ceil(image.getWidth() * bpp / 8.0); - api.TessBaseAPISetPageSegMode(handle, TessPageSegMode.PSM_AUTO); - api.TessBaseAPIInit3(handle, datapath, language); - api.TessBaseAPISetImage(handle, buf, image.getWidth(), image.getHeight(), bytespp, bytespl); - int page_number = 0; - Pointer utf8Text = api.TessBaseAPIGetHOCRText(handle, page_number); - String result = utf8Text.getString(0); - api.TessDeleteText(utf8Text); - assertTrue(result.contains("
0 causes blank ouput - monitor.end_time = timeout; - ProgressMonitor pmo = new ProgressMonitor(monitor); - pmo.start(); - api.TessBaseAPIRecognize(handle, monitor); - logger.info("Message: " + pmo.getMessage()); - TessResultIterator ri = api.TessBaseAPIGetIterator(handle); - TessPageIterator pi = api.TessResultIteratorGetPageIterator(ri); - api.TessPageIteratorBegin(pi); - logger.info("Bounding boxes:\nchar(s) left top right bottom confidence font-attributes"); - int level = TessPageIteratorLevel.RIL_WORD; - - // int height = image.getHeight(); - do { - Pointer ptr = api.TessResultIteratorGetUTF8Text(ri, level); - String word = ptr.getString(0); - api.TessDeleteText(ptr); - float confidence = api.TessResultIteratorConfidence(ri, level); - IntBuffer leftB = IntBuffer.allocate(1); - IntBuffer topB = IntBuffer.allocate(1); - IntBuffer rightB = IntBuffer.allocate(1); - IntBuffer bottomB = IntBuffer.allocate(1); - api.TessPageIteratorBoundingBox(pi, level, leftB, topB, rightB, bottomB); - int left = leftB.get(); - int top = topB.get(); - int right = rightB.get(); - int bottom = bottomB.get(); - System.out.print(String.format("%s %d %d %d %d %f", word, left, top, right, bottom, confidence)); - // logger.info(String.format("%s %d %d %d %d", str, left, height - bottom, right, height - top)); // - // training box coordinates - - IntBuffer boldB = IntBuffer.allocate(1); - IntBuffer italicB = IntBuffer.allocate(1); - IntBuffer underlinedB = IntBuffer.allocate(1); - IntBuffer monospaceB = IntBuffer.allocate(1); - IntBuffer serifB = IntBuffer.allocate(1); - IntBuffer smallcapsB = IntBuffer.allocate(1); - IntBuffer pointSizeB = IntBuffer.allocate(1); - IntBuffer fontIdB = IntBuffer.allocate(1); - String fontName = api.TessResultIteratorWordFontAttributes(ri, boldB, italicB, underlinedB, monospaceB, - serifB, smallcapsB, pointSizeB, fontIdB); - boolean bold = boldB.get() == TRUE; - boolean italic = italicB.get() == TRUE; - boolean underlined = underlinedB.get() == TRUE; - boolean monospace = monospaceB.get() == TRUE; - boolean serif = serifB.get() == TRUE; - boolean smallcaps = smallcapsB.get() == TRUE; - int pointSize = pointSizeB.get(); - int fontId = fontIdB.get(); - logger.info(String.format(" font: %s, size: %d, font id: %d, bold: %b," - + " italic: %b, underlined: %b, monospace: %b, serif: %b, smallcap: %b", fontName, pointSize, - fontId, bold, italic, underlined, monospace, serif, smallcaps)); - } while (api.TessPageIteratorNext(pi, level) == TRUE); - - assertTrue(true); - } - - /** - * Test of ChoiceIterator. - * - * @throws Exception - */ - @Test - public void testChoiceIterator() throws Exception { - logger.info("TessResultIteratorGetChoiceIterator"); - File tiff = new File(testResourcesDataPath, "eurotext.tif"); - BufferedImage image = ImageIO.read(new FileInputStream(tiff)); // require jai-imageio lib to read TIFF - ByteBuffer buf = ImageIOHelper.convertImageData(image); - int bpp = image.getColorModel().getPixelSize(); - int bytespp = bpp / 8; - int bytespl = (int) Math.ceil(image.getWidth() * bpp / 8.0); - api.TessBaseAPIInit3(handle, datapath, language); - api.TessBaseAPISetImage(handle, buf, image.getWidth(), image.getHeight(), bytespp, bytespl); - api.TessBaseAPISetVariable(handle, "save_blob_choices", "T"); - api.TessBaseAPISetRectangle(handle, 37, 228, 548, 31); - ETEXT_DESC monitor = new ETEXT_DESC(); - ProgressMonitor pmo = new ProgressMonitor(monitor); - pmo.start(); - api.TessBaseAPIRecognize(handle, monitor); - logger.info("Message: " + pmo.getMessage()); - TessResultIterator ri = api.TessBaseAPIGetIterator(handle); - int level = TessPageIteratorLevel.RIL_SYMBOL; - - if (ri != null) { - do { - Pointer symbol = api.TessResultIteratorGetUTF8Text(ri, level); - float conf = api.TessResultIteratorConfidence(ri, level); - if (symbol != null) { - logger.info(String.format("symbol %s, conf: %f", symbol.getString(0), conf)); - boolean indent = false; - TessChoiceIterator ci = api.TessResultIteratorGetChoiceIterator(ri); - do { - if (indent) { - System.out.print("\t"); - } - System.out.print("\t- "); - String choice = api.TessChoiceIteratorGetUTF8Text(ci); - logger.info(String.format("%s conf: %f", choice, api.TessChoiceIteratorConfidence(ci))); - indent = true; - } while (api.TessChoiceIteratorNext(ci) == ITessAPI.TRUE); - api.TessChoiceIteratorDelete(ci); - } - logger.info("---------------------------------------------"); - api.TessDeleteText(symbol); - } while (api.TessResultIteratorNext(ri, level) == ITessAPI.TRUE); - } - - assertTrue(true); - } - - /** - * Test of ResultRenderer method, of class TessAPI. - * - * @throws java.lang.Exception - */ - @Test - public void testResultRenderer() throws Exception { - logger.info("TessResultRenderer"); - String image = String.format("%s/%s", testResourcesDataPath, "eurotext.tif"); - String output = "capi-test.txt"; - int set_only_init_params = FALSE; - int oem = TessOcrEngineMode.OEM_DEFAULT; - PointerByReference configs = null; - int configs_size = 0; - - String[] params = {"load_system_dawg", "tessedit_char_whitelist"}; - String vals[] = {"F", ""}; //0123456789-.IThisalotfpnex - PointerByReference vars_vec = new PointerByReference(); - vars_vec.setPointer(new StringArray(params)); - PointerByReference vars_values = new PointerByReference(); - vars_values.setPointer(new StringArray(vals)); - NativeSize vars_vec_size = new NativeSize(params.length); - - api.TessBaseAPISetOutputName(handle, output); - - int rc = api.TessBaseAPIInit4(handle, datapath, language, - oem, configs, configs_size, vars_vec, vars_values, vars_vec_size, set_only_init_params); - - if (rc != 0) { - api.TessBaseAPIDelete(handle); - logger.error("Could not initialize tesseract."); - return; - } - - String outputbase = "test/test-results/outputbase"; - TessResultRenderer renderer = api.TessHOcrRendererCreate(outputbase); - api.TessResultRendererInsert(renderer, api.TessBoxTextRendererCreate(outputbase)); - api.TessResultRendererInsert(renderer, api.TessTextRendererCreate(outputbase)); - String dataPath = api.TessBaseAPIGetDatapath(handle); - api.TessResultRendererInsert(renderer, api.TessPDFRendererCreate(outputbase, dataPath)); - int result = api.TessBaseAPIProcessPages(handle, image, null, 0, renderer); - - if (result == FALSE) { - logger.error("Error during processing."); - return; - } - - for (; renderer != null; renderer = api.TessResultRendererNext(renderer)) { - String ext = api.TessResultRendererExtention(renderer).getString(0); - logger.info(String.format("TessResultRendererExtention: %s\nTessResultRendererTitle: %s\nTessResultRendererImageNum: %d", - ext, - api.TessResultRendererTitle(renderer).getString(0), - api.TessResultRendererImageNum(renderer))); - } - - api.TessDeleteResultRenderer(renderer); - assertTrue(new File(outputbase + ".pdf").exists()); - } -} diff --git a/Tess4J/test/net/sourceforge/tess4j/Tesseract1Test.java b/Tess4J/test/net/sourceforge/tess4j/Tesseract1Test.java deleted file mode 100644 index 7799551..0000000 --- a/Tess4J/test/net/sourceforge/tess4j/Tesseract1Test.java +++ /dev/null @@ -1,267 +0,0 @@ -/** - * Copyright @ 2010 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j; - -import java.awt.Rectangle; -import java.awt.image.BufferedImage; -import java.io.File; -import java.util.ArrayList; -import java.util.List; -import java.util.Arrays; - -import javax.imageio.IIOImage; -import javax.imageio.ImageIO; - -import net.sourceforge.tess4j.util.LoggHelper; -import net.sourceforge.tess4j.util.Utils; -import net.sourceforge.tess4j.util.ImageHelper; -import net.sourceforge.tess4j.util.ImageIOHelper; -import net.sourceforge.tess4j.ITesseract.RenderedFormat; -import net.sourceforge.tess4j.ITessAPI.TessPageIteratorLevel; - -import com.recognition.software.jdeskew.ImageDeskew; - -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -public class Tesseract1Test { - - private static final Logger logger = LoggerFactory.getLogger(new LoggHelper().toString()); - static final double MINIMUM_DESKEW_THRESHOLD = 0.05d; - ITesseract instance; - - private final String datapath = "."; - private final String testResourcesDataPath = "test/resources/test-data"; - private final String expOCRResult = "The (quick) [brown] {fox} jumps!\nOver the $43,456.78 #90 dog"; - - @BeforeClass - public static void setUpClass() throws Exception { - } - - @AfterClass - public static void tearDownClass() throws Exception { - } - - @Before - public void setUp() { - instance = new Tesseract1(); - instance.setDatapath(new File(datapath).getPath()); - } - - @After - public void tearDown() { - } - - /** - * Test of doOCR method, of class Tesseract1. - * - * @throws java.lang.Exception - */ - @Test - public void testDoOCR_File() throws Exception { - logger.info("doOCR on a PNG image"); - File imageFile = new File(testResourcesDataPath, "eurotext.png"); - String expResult = expOCRResult; - String result = instance.doOCR(imageFile); - logger.info(result); - assertEquals(expResult, result.substring(0, expResult.length())); - } - - /** - * Test of doOCR method, of class Tesseract. - * - * @throws java.lang.Exception - */ - @Test - public void testDoOCR_UNLV_Zone_File() throws Exception { - logger.info("doOCR on a PNG image with UNLV zone file .uzn"); - //UNLV zone format: left top width height label - File imageFile = new File(testResourcesDataPath, "eurotext_unlv.png"); - String expResult = "& duck/goose, as 12.5% of E-mail\n\n" - + "from aspammer@website.com is spam.\n\n" - + "The (quick) [brown] {fox} jumps!\n" - + "Over the $43,456.78 #90 dog"; - String result = instance.doOCR(imageFile); - logger.info(result); - assertEquals(expResult, result.trim()); - } - - /** - * Test of doOCR method, of class Tesseract. - * - * @throws java.lang.Exception - */ - @Test - public void testDoOCR_File_With_Configs() throws Exception { - logger.info("doOCR with configs"); - File imageFile = new File(testResourcesDataPath, "eurotext.png"); - String expResult = "[-0123456789.\n ]+"; - List configs = Arrays.asList("digits"); - instance.setConfigs(configs); - String result = instance.doOCR(imageFile); - logger.info(result); - assertTrue(result.matches(expResult)); - } - - /** - * Test of doOCR method, of class Tesseract1. - * - * @throws java.lang.Exception - */ - @Test - public void testDoOCR_File_Rectangle() throws Exception { - logger.info("doOCR on a BMP image with bounding rectangle"); - File imageFile = new File(testResourcesDataPath, "eurotext.bmp"); - Rectangle rect = new Rectangle(0, 0, 1024, 800); // define an equal or smaller region of interest on the image - String expResult = expOCRResult; - String result = instance.doOCR(imageFile, rect); - logger.info(result); - assertEquals(expResult, result.substring(0, expResult.length())); - } - - /** - * Test of doOCR method, of class Tesseract1. - * - * @throws java.lang.Exception - */ - @Test - public void testDoOCR_PDF() throws Exception { - logger.info("doOCR on a PDF document"); - File imageFile = new File(testResourcesDataPath, "eurotext.pdf"); - List imageList = ImageIOHelper.getIIOImageList(imageFile); - String expResult = expOCRResult; - String result = instance.doOCR(imageList, null); - logger.info(result); - assertEquals(expResult, result.substring(0, expResult.length())); - } - - /** - * Test of doOCR method, of class Tesseract1. - * - * @throws java.lang.Exception - */ - @Test - public void testDoOCR_BufferedImage() throws Exception { - logger.info("doOCR on a buffered image of a PNG"); - File imageFile = new File(testResourcesDataPath, "eurotext.png"); - BufferedImage bi = ImageIO.read(imageFile); - String expResult = expOCRResult; - String result = instance.doOCR(bi); - logger.info(result); - assertEquals(expResult, result.substring(0, expResult.length())); - } - - /** - * Test of deskew algorithm. - * - * @throws java.lang.Exception - */ - @Test - public void testDoOCR_SkewedImage() throws Exception { - logger.info("doOCR on a skewed PNG image"); - File imageFile = new File(testResourcesDataPath, "eurotext_deskew.png"); - BufferedImage bi = ImageIO.read(imageFile); - ImageDeskew id = new ImageDeskew(bi); - double imageSkewAngle = id.getSkewAngle(); // determine skew angle - if ((imageSkewAngle > MINIMUM_DESKEW_THRESHOLD || imageSkewAngle < -(MINIMUM_DESKEW_THRESHOLD))) { - bi = ImageHelper.rotateImage(bi, -imageSkewAngle); // deskew image - } - - String expResult = expOCRResult; - String result = instance.doOCR(bi); - logger.info(result); - assertEquals(expResult, result.substring(0, expResult.length())); - } - - /** - * Test of createDocuments method, of class Tesseract. - * - * @throws java.lang.Exception - */ - @Test - public void testCreateDocuments() throws Exception { - logger.info("createDocuments for an image"); - File imageFile1 = new File(testResourcesDataPath, "eurotext.pdf"); - File imageFile2 = new File(testResourcesDataPath, "eurotext.png"); - String outputbase1 = "test/test-results/docrenderer1-1"; - String outputbase2 = "test/test-results/docrenderer1-2"; - List formats = new ArrayList(Arrays.asList(RenderedFormat.HOCR, RenderedFormat.PDF, RenderedFormat.TEXT)); - instance.createDocuments(new String[]{imageFile1.getPath(), imageFile2.getPath()}, new String[]{outputbase1, outputbase2}, formats); - assertTrue(new File(outputbase1 + ".pdf").exists()); - } - - /** - * Test of getWords method, of class Tesseract1. - * - * @throws java.lang.Exception - */ - @Test - public void testGetWords() throws Exception { - logger.info("getWords"); - File imageFile = new File(testResourcesDataPath, "eurotext.tif"); - - String expResult = "The (quick) [brown] {fox} jumps!\nOver the $43,456.78 #90 dog"; - String[] expResults = expResult.split("\\s"); - - int pageIteratorLevel = TessPageIteratorLevel.RIL_WORD; - logger.info("PageIteratorLevel: " + Utils.getConstantName(pageIteratorLevel, TessPageIteratorLevel.class)); - BufferedImage bi = ImageIO.read(imageFile); - List result = instance.getWords(bi, pageIteratorLevel); - - // print the complete result - for (Word word : result) { - logger.info(word.toString()); - } - - List text = new ArrayList(); - for (Word word : result.subList(0, expResults.length)) { - text.add(word.getText()); - } - - assertArrayEquals(expResults, text.toArray()); - } - - /** - * Test of getSegmentedRegions method, of class Tesseract1. - * - * @throws java.lang.Exception - */ - @Test - public void testGetSegmentedRegions() throws Exception { - logger.info("getSegmentedRegions at given TessPageIteratorLevel"); - File imageFile = new File(testResourcesDataPath, "eurotext.png"); - BufferedImage bi = ImageIO.read(imageFile); - int level = TessPageIteratorLevel.RIL_SYMBOL; - logger.info("PageIteratorLevel: " + Utils.getConstantName(level, TessPageIteratorLevel.class)); - List result = instance.getSegmentedRegions(bi, level); - for (int i = 0; i < result.size(); i++) { - Rectangle rect = result.get(i); - logger.info(String.format("Box[%d]: x=%d, y=%d, w=%d, h=%d", i, rect.x, rect.y, rect.width, rect.height)); - } - - assertTrue(result.size() > 0); - } -} diff --git a/Tess4J/test/net/sourceforge/tess4j/TesseractTest.java b/Tess4J/test/net/sourceforge/tess4j/TesseractTest.java deleted file mode 100644 index 9184732..0000000 --- a/Tess4J/test/net/sourceforge/tess4j/TesseractTest.java +++ /dev/null @@ -1,267 +0,0 @@ -/** - * Copyright @ 2010 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package net.sourceforge.tess4j; - -import java.awt.Rectangle; -import java.awt.image.BufferedImage; -import java.io.File; -import java.util.List; -import java.util.ArrayList; -import java.util.Arrays; - -import javax.imageio.IIOImage; -import javax.imageio.ImageIO; - -import net.sourceforge.tess4j.util.ImageHelper; -import net.sourceforge.tess4j.util.ImageIOHelper; -import net.sourceforge.tess4j.util.LoggHelper; -import net.sourceforge.tess4j.util.Utils; - -import net.sourceforge.tess4j.ITesseract.RenderedFormat; -import net.sourceforge.tess4j.ITessAPI.TessPageIteratorLevel; - -import static org.junit.Assert.*; - -import com.recognition.software.jdeskew.ImageDeskew; - -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class TesseractTest { - - private static final Logger logger = LoggerFactory.getLogger(new LoggHelper().toString()); - static final double MINIMUM_DESKEW_THRESHOLD = 0.05d; - ITesseract instance; - - private final String datapath = "."; - private final String testResourcesDataPath = "test/resources/test-data"; - private final String expOCRResult = "The (quick) [brown] {fox} jumps!\nOver the $43,456.78 #90 dog"; - - @BeforeClass - public static void setUpClass() throws Exception { - } - - @AfterClass - public static void tearDownClass() throws Exception { - } - - @Before - public void setUp() { - instance = new Tesseract(); - instance.setDatapath(new File(datapath).getPath()); - } - - @After - public void tearDown() { - } - - /** - * Test of doOCR method, of class Tesseract. - * - * @throws java.lang.Exception - */ - @Test - public void testDoOCR_File() throws Exception { - logger.info("doOCR on a PNG image"); - File imageFile = new File(testResourcesDataPath, "eurotext.png"); - String expResult = expOCRResult; - String result = instance.doOCR(imageFile); - logger.info(result); - assertEquals(expResult, result.substring(0, expResult.length())); - } - - /** - * Test of doOCR method, of class Tesseract. - * - * @throws java.lang.Exception - */ - @Test - public void testDoOCR_UNLV_Zone_File() throws Exception { - logger.info("doOCR on a PNG image with UNLV zone file .uzn"); - //UNLV zone format: left top width height label - File imageFile = new File(testResourcesDataPath, "eurotext_unlv.png"); - String expResult = "& duck/goose, as 12.5% of E-mail\n\n" - + "from aspammer@website.com is spam.\n\n" - + "The (quick) [brown] {fox} jumps!\n" - + "Over the $43,456.78 #90 dog"; - String result = instance.doOCR(imageFile); - logger.info(result); - assertEquals(expResult, result.trim()); - } - - /** - * Test of doOCR method, of class Tesseract. - * - * @throws java.lang.Exception - */ - @Test - public void testDoOCR_File_With_Configs() throws Exception { - logger.info("doOCR with configs"); - File imageFile = new File(testResourcesDataPath, "eurotext.png"); - String expResult = "[-0123456789.\n ]+"; - List configs = Arrays.asList("digits"); - instance.setConfigs(configs); - String result = instance.doOCR(imageFile); - logger.info(result); - assertTrue(result.matches(expResult)); - instance.setConfigs(null); // since Tesseract instance is a singleton, clear configs so the effects do not carry on into subsequent runs. - } - - /** - * Test of doOCR method, of class Tesseract. - * - * @throws java.lang.Exception - */ - @Test - public void testDoOCR_File_Rectangle() throws Exception { - logger.info("doOCR on a BMP image with bounding rectangle"); - File imageFile = new File(testResourcesDataPath, "eurotext.bmp"); - Rectangle rect = new Rectangle(0, 0, 1024, 800); // define an equal or smaller region of interest on the image - String expResult = expOCRResult; - String result = instance.doOCR(imageFile, rect); - logger.info(result); - assertEquals(expResult, result.substring(0, expResult.length())); - } - - /** - * Test of doOCR method, of class Tesseract. - * - * @throws java.lang.Exception - */ - @Test - public void testDoOCR_PDF() throws Exception { - logger.info("doOCR on a PDF document"); - File imageFile = new File(testResourcesDataPath, "eurotext.pdf"); - List imageList = ImageIOHelper.getIIOImageList(imageFile); - String expResult = expOCRResult; - String result = instance.doOCR(imageList, null); - logger.info(result); - assertEquals(expResult, result.substring(0, expResult.length())); - } - - /** - * Test of doOCR method, of class Tesseract. - * - * @throws java.lang.Exception - */ - @Test - public void testDoOCR_BufferedImage() throws Exception { - logger.info("doOCR on a buffered image of a PNG"); - File imageFile = new File(testResourcesDataPath, "eurotext.png"); - BufferedImage bi = ImageIO.read(imageFile); - String expResult = expOCRResult; - String result = instance.doOCR(bi); - logger.info(result); - assertEquals(expResult, result.substring(0, expResult.length())); - } - - /** - * Test of deskew algorithm. - * - * @throws java.lang.Exception - */ - @Test - public void testDoOCR_SkewedImage() throws Exception { - logger.info("doOCR on a skewed PNG image"); - File imageFile = new File(testResourcesDataPath, "eurotext_deskew.png"); - BufferedImage bi = ImageIO.read(imageFile); - ImageDeskew id = new ImageDeskew(bi); - double imageSkewAngle = id.getSkewAngle(); // determine skew angle - if ((imageSkewAngle > MINIMUM_DESKEW_THRESHOLD || imageSkewAngle < -(MINIMUM_DESKEW_THRESHOLD))) { - bi = ImageHelper.rotateImage(bi, -imageSkewAngle); // deskew image - } - - String expResult = expOCRResult; - String result = instance.doOCR(bi); - logger.info(result); - assertEquals(expResult, result.substring(0, expResult.length())); - } - - /** - * Test of createDocuments method, of class Tesseract. - * - * @throws java.lang.Exception - */ - @Test - public void testCreateDocuments() throws Exception { - logger.info("createDocuments for multiple images"); - File imageFile1 = new File(testResourcesDataPath, "eurotext.pdf"); - File imageFile2 = new File(testResourcesDataPath, "eurotext.png"); - String outputbase1 = "test/test-results/docrenderer-1"; - String outputbase2 = "test/test-results/docrenderer-2"; - List formats = new ArrayList(Arrays.asList(RenderedFormat.HOCR, RenderedFormat.PDF, RenderedFormat.TEXT)); - instance.createDocuments(new String[]{imageFile1.getPath(), imageFile2.getPath()}, new String[]{outputbase1, outputbase2}, formats); - assertTrue(new File(outputbase1 + ".pdf").exists()); - } - - /** - * Test of getWords method, of class Tesseract. - * - * @throws java.lang.Exception - */ - @Test - public void testGetWords() throws Exception { - logger.info("getWords"); - File imageFile = new File(testResourcesDataPath, "eurotext.tif"); - - String expResult = "The (quick) [brown] {fox} jumps!\nOver the $43,456.78 #90 dog"; - String[] expResults = expResult.split("\\s"); - - int pageIteratorLevel = TessPageIteratorLevel.RIL_WORD; - logger.info("PageIteratorLevel: " + Utils.getConstantName(pageIteratorLevel, TessPageIteratorLevel.class)); - BufferedImage bi = ImageIO.read(imageFile); - List result = instance.getWords(bi, pageIteratorLevel); - - //print the complete result - for (Word word : result) { - logger.info(word.toString()); - } - - List text = new ArrayList(); - for (Word word : result.subList(0, expResults.length)) { - text.add(word.getText()); - } - - assertArrayEquals(expResults, text.toArray()); - } - - /** - * Test of getSegmentedRegions method, of class Tesseract. - * - * @throws java.lang.Exception - */ - @Test - public void testGetSegmentedRegions() throws Exception { - logger.info("getSegmentedRegions at given TessPageIteratorLevel"); - File imageFile = new File(testResourcesDataPath, "eurotext.png"); - BufferedImage bi = ImageIO.read(imageFile); - int level = TessPageIteratorLevel.RIL_SYMBOL; - logger.info("PageIteratorLevel: " + Utils.getConstantName(level, TessPageIteratorLevel.class)); - List result = instance.getSegmentedRegions(bi, level); - for (int i = 0; i < result.size(); i++) { - Rectangle rect = result.get(i); - logger.info(String.format("Box[%d]: x=%d, y=%d, w=%d, h=%d", i, rect.x, rect.y, rect.width, rect.height)); - } - - assertTrue(result.size() > 0); - } -} diff --git a/Tess4J/test/net/sourceforge/tess4j/TestFolderExtraction.java b/Tess4J/test/net/sourceforge/tess4j/TestFolderExtraction.java deleted file mode 100644 index 46a88fa..0000000 --- a/Tess4J/test/net/sourceforge/tess4j/TestFolderExtraction.java +++ /dev/null @@ -1,84 +0,0 @@ -/** - * Copyright @ 2008 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ - -package net.sourceforge.tess4j; - -import static org.junit.Assert.assertTrue; - -import java.io.File; -import java.net.URISyntaxException; -import java.net.URL; - - -import net.sourceforge.tess4j.util.LoadLibs; - -import net.sourceforge.tess4j.util.LoggHelper; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class TestFolderExtraction { - - private static final Logger logger = LoggerFactory.getLogger(new LoggHelper().toString()); - - @Test - public void testFolderExtraction() { - - File tessDataFolder = null; - - try { - - /** - * Loads the image from resources. - */ - String filename = String.format("%s/%s", "/test-data", "eurotext.pdf"); - URL defaultImage = getClass().getResource(filename); - File imageFile = new File(defaultImage.toURI()); - - /** - * Extracts tessdata folder into a temp folder. - */ - logger.info("Loading the tessdata folder into a temporary folder."); - tessDataFolder = LoadLibs.extractTessResources("tessdata"); - - /** - * Gets tesseract instance and sets data path. - */ - ITesseract instance = new Tesseract(); - - if (tessDataFolder != null) { - logger.info(tessDataFolder.getAbsolutePath()); - instance.setDatapath(tessDataFolder.getParent()); - } - - /** - * Performs OCR on the image. - */ - String result = instance.doOCR(imageFile); - logger.info(result); - - } catch (TesseractException e) { - logger.error(e.getMessage()); - logger.error(e.getMessage(), e); - } catch (URISyntaxException e) { - logger.error(e.getMessage(), e); - } - - // checks if tessdata folder exists - assertTrue(tessDataFolder != null && tessDataFolder.exists()); - } - -} diff --git a/Tess4J/test/net/sourceforge/tess4j/util/PdfUtilitiesTest.java b/Tess4J/test/net/sourceforge/tess4j/util/PdfUtilitiesTest.java deleted file mode 100644 index ff738ef..0000000 --- a/Tess4J/test/net/sourceforge/tess4j/util/PdfUtilitiesTest.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright 2014 Quan Nguyen. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package net.sourceforge.tess4j.util; - -import java.io.File; -import java.io.IOException; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; -import static org.junit.Assert.*; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class PdfUtilitiesTest { - - private static final Logger logger = LoggerFactory.getLogger(new LoggHelper().toString()); - private final String TEST_RESOURCES_DATA_PATH = "test/resources/test-data"; - - @BeforeClass - public static void setUpClass() { - } - - @AfterClass - public static void tearDownClass() { - } - - @Before - public void setUp() { - System.setProperty(PdfUtilities.PDF_LIBRARY, PdfUtilities.PDFBOX); // Note: comment out to test Ghostscript - } - - @After - public void tearDown() { - } - - /** - * Test of convertPdf2Tiff method, of class PdfUtilities. - * - * @throws java.lang.Exception - */ - @Test - public void testConvertPdf2Tiff() throws Exception { - logger.info("convertPdf2Tiff"); - File inputPdfFile = new File(TEST_RESOURCES_DATA_PATH, "eurotext.pdf"); - File result = PdfUtilities.convertPdf2Tiff(inputPdfFile); - result.deleteOnExit(); - assertTrue(result.exists()); - } - - /** - * Test of convertPdf2Png method, of class PdfUtilities. - * - * @throws java.io.IOException - */ - @Test - public void testConvertPdf2Png() throws IOException { - logger.info("convertPdf2Png"); - File inputPdfFile = new File(TEST_RESOURCES_DATA_PATH, "eurotext.pdf"); - File[] results = PdfUtilities.convertPdf2Png(inputPdfFile); - assertTrue(results.length > 0); - - //clean up - File parentDir = results[0].getParentFile(); - for (File result : results) { - result.delete(); - } - parentDir.delete(); - } - - /** - * Test of splitPdf method, of class PdfUtilities. - */ - @Test - public void testSplitPdf() { - logger.info("splitPdf"); - File inputPdfFile = new File(TEST_RESOURCES_DATA_PATH, "multipage-pdf.pdf"); - File outputPdfFile = new File("test/test-results/multipage-pdf_splitted.pdf"); - int startPage = 2; - int endPage = 3; - int expResult = 2; - PdfUtilities.splitPdf(inputPdfFile, outputPdfFile, startPage, endPage); - int pageCount = PdfUtilities.getPdfPageCount(outputPdfFile); - assertEquals(expResult, pageCount); - } - - /** - * Test of getPdfPageCount method, of class PdfUtilities. - */ - @Test - public void testGetPdfPageCount() { - logger.info("getPdfPageCount"); - File inputPdfFile = new File(TEST_RESOURCES_DATA_PATH, "multipage-pdf.pdf"); - int expResult = 5; - int result = PdfUtilities.getPdfPageCount(inputPdfFile); - assertEquals(expResult, result); - } - - /** - * Test of mergePdf method, of class PdfUtilities. - */ - @Test - public void testMergePdf() { - logger.info("mergePdf"); - File pdfPartOne = new File(TEST_RESOURCES_DATA_PATH, "eurotext.pdf"); - File pdfPartTwo = new File(TEST_RESOURCES_DATA_PATH, "multipage-pdf.pdf"); - int expResult = 6; - File outputPdfFile = new File("test/test-results", "multipage-pdf_merged.pdf"); - File[] inputPdfFiles = {pdfPartOne, pdfPartTwo}; - PdfUtilities.mergePdf(inputPdfFiles, outputPdfFile); - assertEquals(expResult, PdfUtilities.getPdfPageCount(outputPdfFile)); - } - -} diff --git a/Tess4J/test/resources/test-data/eurotext.bmp b/Tess4J/test/resources/test-data/eurotext.bmp deleted file mode 100644 index be05080..0000000 Binary files a/Tess4J/test/resources/test-data/eurotext.bmp and /dev/null differ diff --git a/Tess4J/test/resources/test-data/eurotext.pdf b/Tess4J/test/resources/test-data/eurotext.pdf deleted file mode 100644 index eac4388..0000000 Binary files a/Tess4J/test/resources/test-data/eurotext.pdf and /dev/null differ diff --git a/Tess4J/test/resources/test-data/eurotext.png b/Tess4J/test/resources/test-data/eurotext.png deleted file mode 100644 index e5c324e..0000000 Binary files a/Tess4J/test/resources/test-data/eurotext.png and /dev/null differ diff --git a/Tess4J/test/resources/test-data/eurotext.tif b/Tess4J/test/resources/test-data/eurotext.tif deleted file mode 100644 index 92791da..0000000 Binary files a/Tess4J/test/resources/test-data/eurotext.tif and /dev/null differ diff --git a/Tess4J/test/resources/test-data/eurotext_deskew.png b/Tess4J/test/resources/test-data/eurotext_deskew.png deleted file mode 100644 index dfa06bd..0000000 Binary files a/Tess4J/test/resources/test-data/eurotext_deskew.png and /dev/null differ diff --git a/Tess4J/test/resources/test-data/eurotext_unlv.png b/Tess4J/test/resources/test-data/eurotext_unlv.png deleted file mode 100644 index e5c324e..0000000 Binary files a/Tess4J/test/resources/test-data/eurotext_unlv.png and /dev/null differ diff --git a/Tess4J/test/resources/test-data/eurotext_unlv.uzn b/Tess4J/test/resources/test-data/eurotext_unlv.uzn deleted file mode 100644 index 878c8de..0000000 --- a/Tess4J/test/resources/test-data/eurotext_unlv.uzn +++ /dev/null @@ -1,3 +0,0 @@ -97 162 747 50 ThirdLine -97 209 828 55 FourthLine -92 56 810 107 First2Lines \ No newline at end of file diff --git a/Tess4J/test/resources/test-data/multipage-pdf.pdf b/Tess4J/test/resources/test-data/multipage-pdf.pdf deleted file mode 100644 index fa1eade..0000000 Binary files a/Tess4J/test/resources/test-data/multipage-pdf.pdf and /dev/null differ diff --git a/Tess4J/versionchanges.txt b/Tess4J/versionchanges.txt deleted file mode 100644 index 503e438..0000000 --- a/Tess4J/versionchanges.txt +++ /dev/null @@ -1,137 +0,0 @@ -Tess4J Change Summary - -Version 0.1 - initial release (14 Aug 2010): -- Java JNA-based wrapper for Tesseract OCR DLL 2.04 -- Support uncompressed, binary TIFF images - -Version 0.2 (16 Aug 2010): -- Add support for more image formats (PNG, BMP, GIF, PDF, JPEG) -- Add support for compressed, grayscale and colored images - -Version 0.3 (22 Aug 2010): -- Include API support for BufferedImage -- Clean up codes. Remove unsupported API and files -- Document the API - -Version 0.3.1 (26 Aug 2010): -- Send only pixel data, not whole image data, to Tesseract engine, to fix a bug that has erroneously put some words at beginning of line towards end of line - -Version 0.4 (1 Nov 2010): -- Add JNA Direct Mapping calls, which can provide performance near that of custom JNI - -Version 1.0 (30 October 2012): -- Upgrade to Tesseract 3.02 (r798), which is not backward compatible with Tesseract 2.04. -- Implement a new JNA wrapper for the new Tesseract OCR API -- Add more unit test cases -- Update documentation - -Version 1.1 (3 March 2013) -- Update Tesseract DLL to r828 -- Additional API methods, image helper methods, and unit test cases -- Improve handling of Unicode character encoding -- Fix memory leaks -- Add support for determining skew angle and image rotation - -Version 1.2 (22 September 2013) -- Update Tesseract DLL to r866 -- More efficient OCR of multiple images -- Various minor improvements -- Update JNA to v4.0 - -Version 1.3 (31 May 2014) -- Update JNA to v4.1.0 -- Update Ghost4J to v0.5.1 -- Refactoring -- Bundle Tesseract and Leptonica 64-bit DLLs - -Version 1.4 (18 January 2015) -- Refactor to reduce code duplication -- Embed Windows native resources in JAR -- Autoload Windows native libraries - -Version 1.4.1 (24 January 2015) -- Enable use of jna.library.path system property for user-customizable path - -Version 1.5 (13 March 2015) -- Add UNLV zone file support -- Refactor - -Version 2.0 (29 March 2015) -- Upgrade to Tesseract 3.03 (r1050), which is compatible with Tesseract 3.03RC on Linux -- Refactor Tesseract class for extensibility and thread-safety -- Update English language data for Tesseract 3.02 - -Version 3.0 (25 December 2015) -- Upgrade to Tesseract 3.04 (953523b) -- Include Lept4J library -- Incorporate slf4j and logback libraries for logging -- Make GhostScript calls thread safe - -Version 3.1 (21 March 2016) -- Update Tesseract to 3.04.01 (4ef68a0) -- Use Lept4J-1.1.2 (Leptonica 1.72) -- Update JNA to 4.2.2 -- Update Ghost4J to 1.0.1 -- Delete ResultRenderer after use to release PDF file handler - -Version 3.2 (15 May 2016) -- Revert JNA to 4.1.0 due to "Invalid calling convention 63" errors invoking GhostScript via Ghost4J on Linux -- Update Lept4J to 1.2.2 (Leptonica 1.73) -- Recompile Tesseract 3.04.01 DLL against Leptonica 1.73 -- Update GhostScript Windows binary to 9.19 - -Version 3.2.1 (29 May 2016) -- Properly release Box and Boxa resources -- Update Lept4J to 1.2.3 - -Version 3.2.2 (16 February 2017) -- Update GhostScript to 9.20 -- Fix possible NPE with PDF-related codes -- Update dependencies -- Additional image utility methods - -Version 3.3.0 (16 February 2017) -- Upgrade to Tesseract 3.05 (2ca5d0a) -- Update Lept4J to 1.3.0 (Leptonica 1.74.1) - -Version 3.3.1 (23 March 2017) -- Update Lept4J to 1.3.1 -- Update other dependencies - -Version 3.4.0 (1 June 2017) -- Upgrade to Tesseract 3.05.01 (2158661) -- Update Lept4J to 1.4.0 -- Add support for jboss-vfs protocol - -Version 3.4.1 (22 September 2017) -- Not extract/copy native resource if it exists and has same file size -- Update Tesseract 3.05.01 (e2e79c4); link against Leptonica 1.74.4 -- Update Lept4J to 1.6.1 - -Version 3.4.2 (14 November 2017) -- Update Lept4J to 1.6.2 -- Update GhostScript to 9.22 -- Improve handling of PDF files in multi-threaded environment -- Lift limits on number of pages in PDF -- Use TESSDATA_PREFIX environment variable by default, if defined - -Version 3.4.3 (14 January 2018) -- Not extract/copy resource if it exists and has same file size - -Version 3.4.4 (22 February 2018) -- Exclude logback.xml from JAR -- Add image rotate and deskew methods -- Update Lept4J to 1.6.3 - -Version 3.4.5 (21 March 2018) -- Remove GS DLL due to license incompatibility -- Use PDFBox - -Version 3.4.6 (25 March 2018) -- Update PDFBox dependencies - -Version 3.4.7 (16 April 2018) -- Update jai-imageio-core to 1.4.0 for Java 9 fixes - -Version 3.4.8 (2 May 2018) -- Fix a path issue when extracting resources from JAR to temp directory on Windows server \ No newline at end of file