Skip to content

Commit 6a4cc5b

Browse files
committed
check clang19
1 parent 7d202f6 commit 6a4cc5b

File tree

24 files changed

+855
-562
lines changed

24 files changed

+855
-562
lines changed

.github/workflows/clang19.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
on:
2+
push:
3+
branches: [main, master]
4+
pull_request:
5+
branches: [main, master]
6+
7+
jobs:
8+
check:
9+
runs-on: ubuntu-latest
10+
container:
11+
image: ghcr.io/r-hub/containers/clang19:latest
12+
13+
steps:
14+
- uses: actions/checkout@v4
15+
16+
- name: Install dependencies
17+
run: |
18+
R -q -e 'pak::pkg_install(c("deps::.", "any::rcmdcheck"), dependencies = TRUE)'
19+
20+
- uses: r-lib/actions/check-r-package@v2

CRAN-SUBMISSION

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
Version: 5.3.4
2-
Date: 2024-12-17 21:04:41 UTC
3-
SHA: 97f5b5e9d7320a878ddbbd128dc52d8adc40ca0a
1+
Version: 5.3.5
2+
Date: 2024-12-21 03:07:36 UTC
3+
SHA: 7d202f614b340caca7fec216dee6b254bb434552

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: cpp11tesseract
22
Type: Package
33
Title: Open Source OCR Engine
4-
Version: 5.3.4
4+
Version: 5.3.5
55
Authors@R: c(person("Mauricio", "Vargas Sepulveda",
66
role = c("aut", "cre"),
77
email = "m.sepulveda@mail.utoronto.ca",

R/ocr.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
#' @rdname ocr
2626
#' @references [Tesseract: Improving Quality](https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality)
2727
#' @examples
28-
#' file <- system.file("examples", "wilde.png", package = "cpp11tesseract")
28+
#' file <- system.file("examples", "test.png", package = "cpp11tesseract")
2929
#' text <- ocr(file)
3030
#' cat(text)
3131
ocr <- function(file, engine = tesseract("eng"), HOCR = FALSE, opw = "", upw = "") {

R/tessdata.R

Lines changed: 60 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,52 @@
11
#' Tesseract Training Data
22
#'
33
#' Helper function to download training data from the official
4-
#' [tessdata](https://tesseract-ocr.github.io/tessdoc/Data-Files) repository. On Linux, the fast training data can be installed directly with
4+
#' [tessdata](https://tesseract-ocr.github.io/tessdoc/Data-Files) repository.
5+
#' On Linux, the fast training data can be installed directly with
56
#' [yum](https://src.fedoraproject.org/rpms/tesseract) or
67
#' [apt-get](https://packages.debian.org/search?suite=stable&section=all&arch=any&searchon=names&keywords=tesseract-ocr-).
78
#'
89
#' Tesseract uses training data to perform OCR. Most systems default to English
9-
#' training data. To improve OCR performance for other languages you can to install the
10-
#' training data from your distribution. For example to install the spanish training data:
10+
#' training data. To improve OCR performance for other languages you can
11+
#' install the training data from your distribution. For example to install the
12+
#' Spanish training data:
1113
#'
12-
#' - [tesseract-ocr-spa](https://packages.debian.org/testing/tesseract-ocr-spa) (Debian, Ubuntu)
14+
#' - [tesseract-ocr-spa](https://packages.debian.org/testing/tesseract-ocr-spa)
15+
#' (Debian, Ubuntu)
1316
#' - `tesseract-langpack-spa` (Fedora, EPEL)
1417
#'
15-
#' On Windows and MacOS you can install languages using the [tesseract_download] function
16-
#' which downloads training data directly from [github](https://github.com/tesseract-ocr/tessdata)
18+
#' On Windows and MacOS you can install languages using the [tesseract_download]
19+
#' function which downloads training data directly from
20+
#' [github](https://github.com/tesseract-ocr/tessdata)
1721
#' and stores it in the path on disk given by the `TESSDATA_PREFIX` variable.
1822
#'
1923
#' @export
2024
#' @return no return value, called for side effects
2125
#' @aliases tessdata
2226
#' @rdname tessdata
2327
#' @family tesseract
24-
#' @param lang three letter code for language, see [tessdata](https://github.com/tesseract-ocr/tessdata) repository.
28+
#' @param lang three letter code for language, see
29+
#' [tessdata](https://github.com/tesseract-ocr/tessdata) repository.
30+
#' @param model either `fast` or `best` is currently supported. The latter
31+
#' downloads more accurate (but slower) trained models for Tesseract 4.0 or
32+
#' higher
2533
#' @param datapath destination directory in which to download and store the file
26-
#' @param model either `fast` or `best` is currently supported. The latter downloads
27-
#' more accurate (but slower) trained models for Tesseract 4.0 or higher
2834
#' @param progress print progress while downloading
29-
#' @references [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files)
35+
#' @references
36+
#' [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files)
3037
#' @examples
3138
#' # download the french training data
39+
#' # this is wrapped in a \donttest{} block because otherwise the clang19
40+
#' # CRAN check will fail with a "> 5 seconds" message
3241
#' \donttest{
33-
#' tesseract_download("fra", model = "best", datapath = tempdir())
42+
#' dir <- tempdir()
43+
#' tesseract_download("fra", model = "best", datapath = dir)
44+
#' file <- system.file("examples", "french.png", package = "cpp11tesseract")
45+
#' text <- ocr(file, engine = tesseract("fra", datapath = dir))
46+
#' cat(text)
3447
#' }
35-
#'
36-
#' if (any("fra" %in% tesseract_info()$available)) {
37-
#' french <- tesseract("fra")
38-
#' file <- system.file("examples", "french.png", package = "cpp11tesseract")
39-
#' text <- ocr(file, engine = french)
40-
#' cat(text)
41-
#' }
42-
tesseract_download <- function(lang, datapath = NULL, model = c("fast", "best"), progress = interactive()) {
48+
tesseract_download <- function(lang, model = c("fast", "best"),
49+
datapath = NULL, progress = interactive()) {
4350
stopifnot(is.character(lang))
4451
model <- match.arg(model)
4552
if (!length(datapath)) {
@@ -56,44 +63,50 @@ tesseract_download <- function(lang, datapath = NULL, model = c("fast", "best"),
5663
repo <- paste0("tessdata_", model)
5764
release <- "4.1.0"
5865
}
59-
url <- sprintf("https://github.com/tesseract-ocr/%s/raw/%s/%s.traineddata", repo, release, lang)
66+
url <- sprintf("https://github.com/tesseract-ocr/%s/raw/%s/%s.traineddata",
67+
repo, release, lang)
6068
download_helper(url, datapath, progress)
6169
}
6270

6371
#' Tesseract Contributed Training Data
6472
#'
6573
#' Helper function to download training data from the contributed
66-
#' [tessdata_contrib](https://github.com/tesseract-ocr/tessdata_contrib) repository.
74+
#' [tessdata_contrib](https://github.com/tesseract-ocr/tessdata_contrib)
75+
#' repository.
6776
#'
6877
#' @export
6978
#' @return no return value, called for side effects
7079
#' @aliases tessdata
7180
#' @rdname tessdata
7281
#' @family tesseract
7382
#' @seealso [tesseract_download]
74-
#' @param lang three letter code for language, see [tessdata](https://github.com/tesseract-ocr/tessdata) repository.
83+
#' @param lang three letter code for language, see
84+
#' [tessdata](https://github.com/tesseract-ocr/tessdata) repository.
85+
#' @param model either `fast` or `best` is currently supported. The latter
86+
#' downloads more accurate (but slower) trained models for Tesseract 4.0 or
87+
#' higher
7588
#' @param datapath destination directory in which to download and store the file
76-
#' @param model either `fast` or `best` is currently supported. The latter downloads
77-
#' more accurate (but slower) trained models for Tesseract 4.0 or higher
7889
#' @param progress print progress while downloading
79-
#' @references [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files)
90+
#' @references
91+
#' [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files)
8092
#' @examples
81-
#' # download the polytonic greek training data
93+
#' # download the greek training data
94+
#' # this is wrapped in a \donttest{} block because otherwise the clang19
95+
#' # CRAN check will fail with a "> 5 seconds" message
8296
#' \donttest{
83-
#' tesseract_contributed_download("grc_hist", model = "best",
84-
#' datapath = tempdir())
85-
#' }
86-
#'
87-
#' if (any("grc_hist" %in% tesseract_info()$available)) {
88-
#' greek <- tesseract("grc_hist")
89-
#' file <- system.file("examples", "polytonicgreek.png", package = "cpp11tesseract")
90-
#' text <- ocr(file, engine = greek)
91-
#' cat(text)
97+
#' dir <- tempdir()
98+
#' tesseract_contributed_download("grc_hist", model = "best", datapath = dir)
99+
#' file <- system.file("examples", "polytonicgreek.png",
100+
#' package = "cpp11tesseract")
101+
#' text <- ocr(file, engine = tesseract("grc_hist", datapath = dir))
102+
#' cat(text)
92103
#' }
93-
tesseract_contributed_download <- function(lang, datapath = NULL, model = c("fast", "best"), progress = interactive()) {
104+
tesseract_contributed_download <- function(lang, model = c("fast", "best"),
105+
datapath = NULL, progress = interactive()) {
94106
stopifnot(is.character(lang))
95107
if (!any(lang %in% c("grc_hist", "akk"))) {
96-
stop("The only available contributed models are Akkadian and Polytonic Greek (for now).", call. = FALSE)
108+
stop(paste("The only available contributed models are Akkadian and",
109+
"Polytonic Greek (for now)."), call. = FALSE)
97110
}
98111
model <- match.arg(model)
99112
if (!length(datapath)) {
@@ -104,12 +117,14 @@ tesseract_contributed_download <- function(lang, datapath = NULL, model = c("fas
104117
version <- tesseract_version_major()
105118

106119
if (lang == "grc_hist" && version < 4) {
107-
stop("The Polytonic Greek model is only available for Tesseract 4.0 or higher.", call. = FALSE)
120+
stop(paste("The Polytonic Greek model is only available for Tesseract 4.0",
121+
"or higher."), call. = FALSE)
108122
}
109123

110124
if (lang == "grc_hist") {
111125
if (model == "fast") {
112-
warning("The Polytonic Greek model is only available in 'best' quality.", call. = FALSE)
126+
warning(paste("The Polytonic Greek model is only available in 'best'",
127+
"quality."), call. = FALSE)
113128
}
114129
release <- "grc_hist/best"
115130
}
@@ -122,7 +137,8 @@ tesseract_contributed_download <- function(lang, datapath = NULL, model = c("fas
122137
release <- "akk/fast"
123138
}
124139

125-
url <- sprintf("https://github.com/tesseract-ocr/tessdata_contrib/raw/main/%s/%s.traineddata", release, lang)
140+
url <- sprintf(paste0("https://github.com/tesseract-ocr/tessdata_contrib/",
141+
"raw/main/%s/%s.traineddata"), release, lang)
126142
print(url)
127143

128144
download_helper(url, datapath, progress)
@@ -166,7 +182,9 @@ progress_fun <- function(down, up) {
166182
}
167183

168184
warn_on_linux <- function() {
169-
if (identical(.Platform$OS.type, "unix") && !identical(Sys.info()[["sysname"]], "Darwin")) {
170-
warning("On Linux you should install training data via yum/apt. Please check the manual page.", call. = FALSE)
185+
if (identical(.Platform$OS.type, "unix") &&
186+
!identical(Sys.info()[["sysname"]], "Darwin")) {
187+
warning("On Linux you should install training data via yum/apt. Please
188+
check the manual page.", call. = FALSE)
171189
}
172190
}

README.Rmd

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ How to extract text from an image:
3838

3939
```{r}
4040
library(cpp11tesseract)
41-
text <- ocr("inst/examples/wilde.jpg")
41+
text <- ocr("inst/examples/wilde.png")
4242
cat(text)
4343
```
4444

@@ -118,3 +118,12 @@ Alternatively you can manually download training data from [github](https://gith
118118
and store it in a path on disk that you pass in the `datapath` parameter or set a default path via the
119119
`TESSDATA_PREFIX` environment variable. Note that Tesseract 4 and Tesseract 3 use different
120120
training data formats. Make sure to download training data from the branch that matches your libtesseract version.
121+
122+
## Testing with docker (development)
123+
124+
```
125+
mkdir check
126+
docker run -v `pwd`/check:/check ghcr.io/r-hub/containers/clang19:latest apt install apt-utils libcurl4-openssl-dev &\
127+
R -q -e "install.packages(c('Rcpp', 'jsonlite', 'curl', 'httr', 'yaml', 'rex', 'digest', 'crayon', 'withr', 'cli', 'magick', 'processx', 'tibble', 'V8', 'testthat', 'mockery', 'whoami', 'covr', 'asciicast'), repos = 'https://cloud.r-project.org')" &\
128+
r-check
129+
```

README.md

Lines changed: 13 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -31,41 +31,20 @@ How to extract text from an image:
3131

3232
``` r
3333
library(cpp11tesseract)
34-
text <- ocr("inst/examples/wilde.jpg")
34+
text <- ocr("inst/examples/wilde.png")
3535
cat(text)
36-
#> Act One
37-
#> [The living room of Algernon Moncrieff's flat in Mayfair, London.
38-
#> Lane is arranging afternoon tea on a table. Algemion enters}
39-
#> Algernon: Lane, have you made the cucumber sandwiches for
40-
#> Lady Bracknell’s tea?
41-
#> Lane: Yes, sir. [Handing them to Algernon on a silver tray]
42-
#> Algernon: [Looking carefully at them, taking two and sitting down
43-
#> on the sofa] Oh, by the way’, Lane, I looked at your notebook. |
44-
#> noticed that when Lord Shoreman and Mr Worthing dined with
45-
#> me on Thursday night, eight bottles of champagne were drunk,
46-
#> Lane: Yes, sir; eight bottles.
47-
#> Algernon: Why is it that, in a bachelor’s home, the servants
48-
#> always drink the champagne? I just ask because | am interested,
49-
#> Lane.
50-
#> Lane: I think that it is because the champagne is better in a
51-
#> bachelor’s home. | have noticed that the champagne in married
52-
#> people's homes is rarely very good.
53-
#> Algernon: Good heavens*! Is marriage so depressing?
54-
#> Lane: | believe marriage is very pleasant, sir. | haven't had much
55-
#> experience of it myself. [ have only been married once, and that
56-
#> was because of a misunderstanding” between myself and a young
57-
#> person.
58-
#> Algernon: [Lazily, without interest] I am not very interested in
59-
#> your family life, Lane.
60-
#> Lane: No, sir; it is not a very interesting subject. I never think
61-
#> of it myself.
62-
#> Algernon: That is very understandable. Well, thank you, Lane.
63-
#> [Lane goes off]
64-
#> Algernon: [To himself] Lane’s views on marriage seem very casual.
65-
#> Really, if the servants don’t set us a good example, what on earth
66-
#> is the use of them? They seem to have no morals
67-
#> [Lane enters]
68-
#> Lane: Mr Ernest Worthing is here, sir.
36+
#> Complete Works
37+
#> oF
38+
#> OSCAR WILDE
39+
#> EDITED BY
40+
#>
41+
#> ROBERT ROSS
42+
#> MISCELLANIES
43+
#> ‘AUTHORIZED EDITION
44+
#>
45+
#> THE WYMAN-FOGG COMPANY
46+
#>
47+
#> BOSTON :: MASSACHUSETTS
6948
```
7049

7150
## Differences with the original tesseract R package

configure

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ PKG_RPM_NAME="tesseract-devel leptonica-devel"
1313
PKG_BREW_NAME="tesseract"
1414
PKG_TEST_HEADER_TESSERACT="<tesseract/baseapi.h>"
1515
PKG_CFLAGS="-I/usr/include/tesseract -I/usr/include/leptonica"
16-
PKG_LIBS="-ltesseract"
16+
PKG_LIBS="-ltesseract -larchive -lcurl"
1717

1818
# Use pkg-config if available
1919
pkg-config --version >/dev/null 2>&1
@@ -23,6 +23,8 @@ if [ $? -eq 0 ]; then
2323
fi
2424

2525
# Debugging information
26+
echo "CC: $CC"
27+
echo "CXX: $CXX"
2628
echo "PKGCONFIG_CFLAGS_TESSERACT: $PKGCONFIG_CFLAGS_TESSERACT"
2729
echo "PKGCONFIG_LIBS_TESSERACT: $PKGCONFIG_LIBS_TESSERACT"
2830

@@ -37,6 +39,11 @@ elif [ "$PKGCONFIG_CFLAGS_TESSERACT" ] || [ "$PKGCONFIG_LIBS_TESSERACT" ]; then
3739
PKG_LIBS="${PKGCONFIG_LIBS_TESSERACT}"
3840
fi
3941

42+
# Check if the compiler is clang
43+
if [ "$CC" = "clang" ] || [ "$CXX" = "clang++" ]; then
44+
PKG_LIBS="$PKG_LIBS -stdlib=libc++"
45+
fi
46+
4047
# For debugging
4148
echo "Using PKG_CFLAGS=$PKG_CFLAGS"
4249
echo "Using PKG_LIBS=$PKG_LIBS"

0 commit comments

Comments
 (0)