11# ' Tesseract Training Data
22# '
33# ' Helper function to download training data from the official
4- # ' [tessdata](https://tesseract-ocr.github.io/tessdoc/Data-Files) repository. On Linux, the fast training data can be installed directly with
4+ # ' [tessdata](https://tesseract-ocr.github.io/tessdoc/Data-Files) repository.
5+ # ' On Linux, the fast training data can be installed directly with
56# ' [yum](https://src.fedoraproject.org/rpms/tesseract) or
67# ' [apt-get](https://packages.debian.org/search?suite=stable&section=all&arch=any&searchon=names&keywords=tesseract-ocr-).
78# '
89# ' Tesseract uses training data to perform OCR. Most systems default to English
9- # ' training data. To improve OCR performance for other languages you can to install the
10- # ' training data from your distribution. For example to install the spanish training data:
10+ # ' training data. To improve OCR performance for other languages you can
11+ # ' install the training data from your distribution. For example to install the
12+ # ' spanish training data:
1113# '
12- # ' - [tesseract-ocr-spa](https://packages.debian.org/testing/tesseract-ocr-spa) (Debian, Ubuntu)
14+ # ' - [tesseract-ocr-spa](https://packages.debian.org/testing/tesseract-ocr-spa)
15+ # ' (Debian, Ubuntu)
1316# ' - `tesseract-langpack-spa` (Fedora, EPEL)
1417# '
15- # ' On Windows and MacOS you can install languages using the [tesseract_download] function
16- # ' which downloads training data directly from [github](https://github.com/tesseract-ocr/tessdata)
18+ # ' On Windows and MacOS you can install languages using the [tesseract_download]
19+ # ' function which downloads training data directly from
20+ # ' [github](https://github.com/tesseract-ocr/tessdata)
1721# ' and stores it in the path on disk given by the `TESSDATA_PREFIX` variable.
1822# '
1923# ' @export
2024# ' @return no return value, called for side effects
2125# ' @aliases tessdata
2226# ' @rdname tessdata
2327# ' @family tesseract
24- # ' @param lang three letter code for language, see [tessdata](https://github.com/tesseract-ocr/tessdata) repository.
28+ # ' @param lang three letter code for language, see
29+ # ' [tessdata](https://github.com/tesseract-ocr/tessdata) repository.
30+ # ' @param model either `fast` or `best` is currently supported. The latter
31+ # ' downloads more accurate (but slower) trained models for Tesseract 4.0 or
32+ # ' higher
2533# ' @param datapath destination directory where to download and store the file
26- # ' @param model either `fast` or `best` is currently supported. The latter downloads
27- # ' more accurate (but slower) trained models for Tesseract 4.0 or higher
2834# ' @param progress print progress while downloading
29- # ' @references [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files)
35+ # ' @references
36+ # ' [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files)
3037# ' @examples
3138# ' # download the french training data
39+ # ' # this is wrapped in a \donttest{} block because otherwise the clang19
40+ # ' # CRAN check will fail with a "> 5 seconds" message
3241# ' \donttest{
33- # ' tesseract_download("fra", model = "best", datapath = tempdir())
42+ # ' dir <- tempdir()
43+ # ' tesseract_download("fra", model = "best", datapath = dir)
44+ # ' file <- system.file("examples", "french.png", package = "cpp11tesseract")
45+ # ' text <- ocr(file, engine = tesseract("fra", datapath = dir))
46+ # ' cat(text)
3447# ' }
35- # '
36- # ' if (any("fra" %in% tesseract_info()$available)) {
37- # ' french <- tesseract("fra")
38- # ' file <- system.file("examples", "french.png", package = "cpp11tesseract")
39- # ' text <- ocr(file, engine = french)
40- # ' cat(text)
41- # ' }
42- tesseract_download <- function (lang , datapath = NULL , model = c(" fast" , " best" ), progress = interactive()) {
48+ tesseract_download <- function (lang , model = c(" fast" , " best" ),
49+ datapath = NULL , progress = interactive()) {
4350 stopifnot(is.character(lang ))
4451 model <- match.arg(model )
4552 if (! length(datapath )) {
@@ -56,44 +63,50 @@ tesseract_download <- function(lang, datapath = NULL, model = c("fast", "best"),
5663 repo <- paste0(" tessdata_" , model )
5764 release <- " 4.1.0"
5865 }
59- url <- sprintf(" https://github.com/tesseract-ocr/%s/raw/%s/%s.traineddata" , repo , release , lang )
66+ url <- sprintf(" https://github.com/tesseract-ocr/%s/raw/%s/%s.traineddata" ,
67+ repo , release , lang )
6068 download_helper(url , datapath , progress )
6169}
6270
6371# ' Tesseract Contributed Training Data
6472# '
6573# ' Helper function to download training data from the contributed
66- # ' [tessdata_contrib](https://github.com/tesseract-ocr/tessdata_contrib) repository.
74+ # ' [tessdata_contrib](https://github.com/tesseract-ocr/tessdata_contrib)
75+ # ' repository.
6776# '
6877# ' @export
6978# ' @return no return value, called for side effects
7079# ' @aliases tessdata
7180# ' @rdname tessdata
7281# ' @family tesseract
7382# ' @seealso [tesseract_download]
74- # ' @param lang three letter code for language, see [tessdata](https://github.com/tesseract-ocr/tessdata) repository.
83+ # ' @param lang three letter code for language, see
84+ # ' [tessdata](https://github.com/tesseract-ocr/tessdata) repository.
85+ # ' @param model either `fast` or `best` is currently supported. The latter
86+ # ' downloads more accurate (but slower) trained models for Tesseract 4.0 or
87+ # ' higher
7588# ' @param datapath destination directory where to download and store the file
76- # ' @param model either `fast` or `best` is currently supported. The latter downloads
77- # ' more accurate (but slower) trained models for Tesseract 4.0 or higher
7889# ' @param progress print progress while downloading
79- # ' @references [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files)
90+ # ' @references
91+ # ' [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files)
8092# ' @examples
81- # ' # download the polytonic greek training data
93+ # ' # download the greek training data
94+ # ' # this is wrapped in a \donttest{} block because otherwise the clang19
95+ # ' # CRAN check will fail with a "> 5 seconds" message
8296# ' \donttest{
83- # ' tesseract_contributed_download("grc_hist", model = "best",
84- # ' datapath = tempdir())
85- # ' }
86- # '
87- # ' if (any("grc_hist" %in% tesseract_info()$available)) {
88- # ' greek <- tesseract("grc_hist")
89- # ' file <- system.file("examples", "polytonicgreek.png", package = "cpp11tesseract")
90- # ' text <- ocr(file, engine = greek)
91- # ' cat(text)
97+ # ' dir <- tempdir()
98+ # ' tesseract_contributed_download("grc_hist", model = "best", datapath = dir)
99+ # ' file <- system.file("examples", "polytonicgreek.png",
100+ # ' package = "cpp11tesseract")
101+ # ' text <- ocr(file, engine = tesseract("grc_hist", datapath = dir))
102+ # ' cat(text)
92103# ' }
93- tesseract_contributed_download <- function (lang , datapath = NULL , model = c(" fast" , " best" ), progress = interactive()) {
104+ tesseract_contributed_download <- function (lang , model = c(" fast" , " best" ),
105+ datapath = NULL , progress = interactive()) {
94106 stopifnot(is.character(lang ))
95107 if (! any(lang %in% c(" grc_hist" , " akk" ))) {
96- stop(" The only available contributed models are Akkadian and Polytonic Greek (for now)." , call. = FALSE )
108+ stop(paste(" The only available contributed models are Akkadian and" ,
109+ " Polytonic Greek (for now)." ), call. = FALSE )
97110 }
98111 model <- match.arg(model )
99112 if (! length(datapath )) {
@@ -104,12 +117,14 @@ tesseract_contributed_download <- function(lang, datapath = NULL, model = c("fas
104117 version <- tesseract_version_major()
105118
106119 if (lang == " grc_hist" && version < 4 ) {
107- stop(" The Polytonic Greek model is only available for Tesseract 4.0 or higher." , call. = FALSE )
120+ stop(paste(" The Polytonic Greek model is only available for Tesseract 4.0" ,
121+ " or higher." ), call. = FALSE )
108122 }
109123
110124 if (lang == " grc_hist" ) {
111125 if (model == " fast" ) {
112- warning(" The Polytonic Greek model is only available in 'best' quality." , call. = FALSE )
126+ warning(paste(" The Polytonic Greek model is only available in 'best'" ,
127+ " quality." ), call. = FALSE )
113128 }
114129 release <- " grc_hist/best"
115130 }
@@ -122,7 +137,8 @@ tesseract_contributed_download <- function(lang, datapath = NULL, model = c("fas
122137 release <- " akk/fast"
123138 }
124139
125- url <- sprintf(" https://github.com/tesseract-ocr/tessdata_contrib/raw/main/%s/%s.traineddata" , release , lang )
140+ url <- sprintf(paste0(" https://github.com/tesseract-ocr/tessdata_contrib/" ,
141+ " raw/main/%s/%s.traineddata" ), release , lang )
126142 print(url )
127143
128144 download_helper(url , datapath , progress )
@@ -166,7 +182,9 @@ progress_fun <- function(down, up) {
166182}
167183
# Emit a warning when running on a unix-alike that is not macOS (sysname
# "Darwin"), pointing the user at the distribution's package manager.
#
# The message is assembled from two string literals with paste() rather than
# one literal broken across source lines: a literal that spans lines embeds the
# newline and the continuation line's indentation in the warning text.
#
# Returns the value of the `if` expression: invisible NULL when the condition
# is FALSE, otherwise whatever warning() returns.
warn_on_linux <- function() {
  if (identical(.Platform$OS.type, "unix") &&
    !identical(Sys.info()[["sysname"]], "Darwin")) {
    warning(paste(
      "On Linux you should install training data via yum/apt.",
      "Please check the manual page."
    ), call. = FALSE)
  }
}
0 commit comments