Skip to content

Commit 5655604

Browse files
committed
add cpp11poppler to suggests
1 parent 593a287 commit 5655604

File tree

9 files changed

+13
-177
lines changed

9 files changed

+13
-177
lines changed

DESCRIPTION

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,10 @@ Suggests:
3939
knitr,
4040
tibble,
4141
rmarkdown,
42-
testthat (>= 3.0.0)
42+
testthat (>= 3.0.0),
43+
cpp11poppler
44+
Remotes:
45+
pachadotdev/cpp11poppler
4346
Encoding: UTF-8
4447
VignetteBuilder: knitr
4548
Language: en-US

R/cpp11.R

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -47,15 +47,3 @@ ocr_raw_data <- function(input, ptr) {
4747
ocr_file_data <- function(file, ptr) {
4848
.Call(`_cpp11tesseract_ocr_file_data`, file, ptr)
4949
}
50-
51-
n_pages <- function(file_path, opw, upw) {
52-
.Call(`_cpp11tesseract_n_pages`, file_path, opw, upw)
53-
}
54-
55-
get_poppler_config <- function() {
56-
.Call(`_cpp11tesseract_get_poppler_config`)
57-
}
58-
59-
poppler_convert <- function(file_path, format, pages, names, dpi, opw, upw, antialiasing, text_antialiasing) {
60-
.Call(`_cpp11tesseract_poppler_convert`, file_path, format, pages, names, dpi, opw, upw, antialiasing, text_antialiasing)
61-
}

R/images.R

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,7 @@
1-
is.pdf <- function(x) {
2-
grepl("\\.pdf$", tolower(x))
3-
}
4-
51
is.tiff <- function(x) {
62
grepl("\\.tiff$", tolower(x))
73
}
84

9-
pdf_convert <- function(pdf, format = "png", pages = NULL, dpi = 72,
10-
antialias = TRUE, opw = "", upw = "") {
11-
config <- get_poppler_config()
12-
13-
if (isFALSE(config$render) || isFALSE(length(config$format) > 0)) {
14-
stop("You version of libppoppler does not support rendering")
15-
}
16-
17-
format <- match.arg(format, config$format)
18-
19-
if (is.null(pages)) {
20-
pages <- seq_len(n_pages(file, opw = opw, upw = upw))
21-
}
22-
23-
if (isFALSE(is.numeric(pages)) || isFALSE(length(pages) > 0)) {
24-
stop("Argument 'pages' must be a one-indexed vector of page numbers")
25-
}
26-
27-
antialiasing <- isTRUE(antialias) || isTRUE(antialias == "draw")
28-
29-
text_antialiasing <- isTRUE(antialias) || isTRUE(antialias == "text")
30-
31-
dout <- tempdir()
32-
suppressWarnings(try(dir.create(dout)))
33-
34-
filenames <- file.path(dout, sprintf("%s-%03d.%s",
35-
# basename(pdf),
36-
# remove the file extension
37-
tools::file_path_sans_ext(basename(pdf)),
38-
pages, format)
39-
)
40-
41-
poppler_convert(pdf, format, pages, filenames, dpi, opw, upw, antialiasing, text_antialiasing)
42-
}
43-
445
tiff_convert <- function(tiff, format = "png", dpi = 72) {
456
stopifnot(requireNamespace("magick", quietly = TRUE))
467
magick::image_convert(magick::image_read(tiff), format = format)

R/ocr.R

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,21 +41,14 @@ ocr <- function(file, engine = tesseract("eng"), HOCR = FALSE, opw = "", upw = "
4141
magick::image_write(x, tmp, format = "PNG", density = "300x300")
4242
ocr(tmp, engine = engine, HOCR = HOCR)
4343
}, character(1))
44-
} else if (isTRUE(is.character(file)) && isFALSE(is.pdf(file))) {
44+
} else if (isTRUE(is.character(file))) {
4545
if (isFALSE(is.tiff(file))) {
4646
vapply(file, ocr_file, character(1), ptr = engine, HOCR = HOCR, USE.NAMES = FALSE)
4747
} else {
4848
ocr(tiff_convert(file), engine, HOCR = HOCR)
4949
}
5050
} else if (isTRUE(is.raw(file))) {
5151
ocr_raw(file, engine, HOCR = HOCR)
52-
} else if (isTRUE(is.pdf(file))) {
53-
n <- n_pages(file, opw = opw, upw = upw)
54-
fout <- pdf_convert(file, format = "png", pages = 1:n, opw = opw, upw = upw)
55-
out <- vapply(fout, function(x) ocr(x, engine = engine, HOCR = HOCR), character(1))
56-
unlink(fout)
57-
names(out) <- NULL
58-
out
5952
} else {
6053
stop("Argument 'file' must be file-path, url or raw vector")
6154
}

src/cpp11.cpp

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -89,39 +89,15 @@ extern "C" SEXP _cpp11tesseract_ocr_file_data(SEXP file, SEXP ptr) {
8989
return cpp11::as_sexp(ocr_file_data(cpp11::as_cpp<cpp11::decay_t<const std::string &>>(file), cpp11::as_cpp<cpp11::decay_t<TessPtr>>(ptr)));
9090
END_CPP11
9191
}
92-
// cpp11tesseract.cpp
93-
int n_pages(const std::string & file_path, const std::string & opw, const std::string & upw);
94-
extern "C" SEXP _cpp11tesseract_n_pages(SEXP file_path, SEXP opw, SEXP upw) {
95-
BEGIN_CPP11
96-
return cpp11::as_sexp(n_pages(cpp11::as_cpp<cpp11::decay_t<const std::string &>>(file_path), cpp11::as_cpp<cpp11::decay_t<const std::string &>>(opw), cpp11::as_cpp<cpp11::decay_t<const std::string &>>(upw)));
97-
END_CPP11
98-
}
99-
// cpp11tesseract.cpp
100-
list get_poppler_config();
101-
extern "C" SEXP _cpp11tesseract_get_poppler_config() {
102-
BEGIN_CPP11
103-
return cpp11::as_sexp(get_poppler_config());
104-
END_CPP11
105-
}
106-
// cpp11tesseract.cpp
107-
std::vector<std::string> poppler_convert(const std::string & file_path, const std::string & format, const std::vector<int> & pages, const std::vector<std::string> & names, const double & dpi, const std::string & opw, const std::string & upw, const bool & antialiasing, const bool & text_antialiasing);
108-
extern "C" SEXP _cpp11tesseract_poppler_convert(SEXP file_path, SEXP format, SEXP pages, SEXP names, SEXP dpi, SEXP opw, SEXP upw, SEXP antialiasing, SEXP text_antialiasing) {
109-
BEGIN_CPP11
110-
return cpp11::as_sexp(poppler_convert(cpp11::as_cpp<cpp11::decay_t<const std::string &>>(file_path), cpp11::as_cpp<cpp11::decay_t<const std::string &>>(format), cpp11::as_cpp<cpp11::decay_t<const std::vector<int> &>>(pages), cpp11::as_cpp<cpp11::decay_t<const std::vector<std::string> &>>(names), cpp11::as_cpp<cpp11::decay_t<const double &>>(dpi), cpp11::as_cpp<cpp11::decay_t<const std::string &>>(opw), cpp11::as_cpp<cpp11::decay_t<const std::string &>>(upw), cpp11::as_cpp<cpp11::decay_t<const bool &>>(antialiasing), cpp11::as_cpp<cpp11::decay_t<const bool &>>(text_antialiasing)));
111-
END_CPP11
112-
}
11392

11493
extern "C" {
11594
static const R_CallMethodDef CallEntries[] = {
11695
{"_cpp11tesseract_engine_info_internal", (DL_FUNC) &_cpp11tesseract_engine_info_internal, 1},
11796
{"_cpp11tesseract_get_param_values", (DL_FUNC) &_cpp11tesseract_get_param_values, 2},
118-
{"_cpp11tesseract_get_poppler_config", (DL_FUNC) &_cpp11tesseract_get_poppler_config, 0},
119-
{"_cpp11tesseract_n_pages", (DL_FUNC) &_cpp11tesseract_n_pages, 3},
12097
{"_cpp11tesseract_ocr_file", (DL_FUNC) &_cpp11tesseract_ocr_file, 3},
12198
{"_cpp11tesseract_ocr_file_data", (DL_FUNC) &_cpp11tesseract_ocr_file_data, 2},
12299
{"_cpp11tesseract_ocr_raw", (DL_FUNC) &_cpp11tesseract_ocr_raw, 3},
123100
{"_cpp11tesseract_ocr_raw_data", (DL_FUNC) &_cpp11tesseract_ocr_raw_data, 2},
124-
{"_cpp11tesseract_poppler_convert", (DL_FUNC) &_cpp11tesseract_poppler_convert, 9},
125101
{"_cpp11tesseract_print_params", (DL_FUNC) &_cpp11tesseract_print_params, 1},
126102
{"_cpp11tesseract_tesseract_config", (DL_FUNC) &_cpp11tesseract_tesseract_config, 0},
127103
{"_cpp11tesseract_tesseract_engine_internal", (DL_FUNC) &_cpp11tesseract_tesseract_engine_internal, 5},

src/cpp11tesseract.cpp

Lines changed: 0 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,6 @@
77
#define GenericVector std::vector
88
#endif
99

10-
#include <poppler-document.h>
11-
#include <poppler-page.h>
12-
#include <poppler-image.h>
13-
#include <poppler-page-renderer.h>
14-
1510
#include <memory>
1611
#include <list>
1712
#include <string>
@@ -297,58 +292,3 @@ data_frame ocr_data_internal(tesseract::TessBaseAPI *api, Pix *image) {
297292
if (!image) throw std::runtime_error("Failed to read image");
298293
return ocr_data_internal(api, image);
299294
}
300-
301-
[[cpp11::register]] int n_pages(const std::string &file_path,
302-
const std::string &opw,
303-
const std::string &upw) {
304-
auto doc = poppler::document::load_from_file(file_path, opw, upw);
305-
306-
if (!doc) {
307-
throw std::runtime_error("PDF parsing failure.");
308-
}
309-
310-
if (doc->is_locked()) {
311-
throw std::runtime_error("PDF file is locked. Invalid password?");
312-
}
313-
314-
return doc->pages();
315-
}
316-
317-
[[cpp11::register]] list get_poppler_config() {
318-
bool render_feature = poppler::page_renderer::can_render();
319-
std::vector<std::string> formats = poppler::image::supported_image_formats();
320-
writable::strings formats2(formats.size());
321-
for (size_t i = 0; i < formats.size(); ++i) {
322-
formats2[i] = formats[i];
323-
}
324-
return writable::list({
325-
"render"_nm = render_feature,
326-
"format"_nm = formats2
327-
});
328-
}
329-
330-
[[cpp11::register]] std::vector<std::string> poppler_convert(
331-
const std::string &file_path,
332-
const std::string &format, const std::vector<int> &pages,
333-
const std::vector<std::string> &names, const double &dpi,
334-
const std::string & opw, const std::string &upw,
335-
const bool &antialiasing, const bool &text_antialiasing) {
336-
auto doc = poppler::document::load_from_file(file_path, opw, upw);
337-
for(size_t i = 0; i < pages.size(); i++){
338-
int pagenum = pages[i];
339-
std::string filename = names[i];
340-
std::unique_ptr<poppler::page> p(doc->create_page(pagenum - 1));
341-
if(!p)
342-
throw std::runtime_error("Invalid page.");
343-
poppler::page_renderer pr;
344-
pr.set_render_hint(poppler::page_renderer::antialiasing, antialiasing);
345-
pr.set_render_hint(poppler::page_renderer::text_antialiasing,
346-
text_antialiasing);
347-
poppler::image img = pr.render_page(p.get(), dpi, dpi);
348-
if(!img.is_valid())
349-
throw std::runtime_error("PDF rendering failure.");
350-
if(!img.save(filename, format, dpi))
351-
throw std::runtime_error("Failed to save file" + filename);
352-
}
353-
return names;
354-
}

tests/testthat/test-download.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
test_that("tesseract_download works", {
2+
skip_on_cran()
23
expect_no_error(tesseract_download("ron", datapath = tempdir(), model = "fast"))
34
expect_error(tesseract_download("rou", datapath = tempdir(), model = "fast"))
45
})
56

67
test_that("tesseract_contributed_download works", {
7-
8+
skip_on_cran()
89
expect_warning(tesseract_contributed_download("grc_hist", datapath = tempdir(), model = "fast"))
910

10-
# if OS is Linux
1111
if (identical(.Platform$OS.type, "unix") && !identical(Sys.info()[["sysname"]], "Darwin")) {
1212
expect_warning(warn_on_linux())
1313
}

tests/testthat/test-multipage.R

Lines changed: 0 additions & 13 deletions
This file was deleted.

vignettes/intro.Rmd

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -121,30 +121,18 @@ text <- input %>%
121121
cat(text)
122122
```
123123

124-
## Read from scanned PDF files and multipage TIFF files
124+
## Read from PDF files
125125

126-
If your images are stored in PDF files they first need to be converted to a proper image format. Use a high DPI to keep quality of the image.
126+
If your images are stored in PDF files, they first need to be converted to a proper image format. We can do this in R using the `pdf_convert` function from the cpp11poppler package. Use a high DPI to preserve the quality of the image.
127127

128-
```{r, eval=require(magick)}
128+
```{r, eval=require(cpp11poppler)}
129+
library(cpp11poppler)
129130
file <- system.file("examples", "ocrscan.pdf", package = "cpp11tesseract")
130-
text <- ocr(file)
131+
pngfile <- pdf_convert(file, dpi = 600)
132+
text <- ocr(pngfile)
131133
cat(text)
132134
```
133135

134-
The `ocr()` function can also extract text from multipage scanned PDF and TIFF files.
135-
The function will return a vector with the text found in each page. This is a good example
136-
that OCR is not perfect.
137-
138-
```{r}
139-
file1 <- system.file("examples", "bondargentina.pdf", package = "cpp11tesseract")
140-
ocr1 <- ocr(file1, engine = eng)
141-
cat(ocr1)
142-
143-
file2 <- system.file("examples", "bondargentina.tiff", package = "cpp11tesseract")
144-
ocr2 <- ocr(file2, engine = eng)
145-
cat(ocr2)
146-
```
147-
148136
## Tesseract Control Parameters
149137

150138
Tesseract supports hundreds of "control parameters" which alter the OCR engine. Use `tesseract_params()` to list all parameters with their default value and a brief description. It also has a handy `filter` argument to quickly find parameters that match a particular string.

0 commit comments

Comments
 (0)