Skip to content

Commit bdbef45

Browse files
authored
Merge pull request #897 from Islandora/hocr
Add hOCR option to Text Extraction Media Attachment action and IIIF Manifest
2 parents 0948436 + 2e47801 commit bdbef45

File tree

2 files changed

+56
-7
lines changed

2 files changed

+56
-7
lines changed

modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -189,21 +189,27 @@ public function render() {
189189
*/
190190
protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_base_id) {
191191
$canvases = [];
192-
foreach ($this->options['iiif_tile_field'] as $iiif_tile_field) {
192+
foreach (array_filter(array_values($this->options['iiif_tile_field'])) as $iiif_tile_field) {
193193
$viewsField = $this->view->field[$iiif_tile_field];
194+
$iiif_ocr_file_field = !empty($this->options['iiif_ocr_file_field']) ? array_filter(array_values($this->options['iiif_ocr_file_field'])) : [];
195+
$ocrField = count($iiif_ocr_file_field) > 0 ? $this->view->field[$iiif_ocr_file_field[0]] : NULL;
194196
$entity = $viewsField->getEntity($row);
195197

196198
if (isset($entity->{$viewsField->definition['field_name']})) {
197199

198200
/** @var \Drupal\Core\Field\FieldItemListInterface $images */
199201
$images = $entity->{$viewsField->definition['field_name']};
200-
foreach ($images as $image) {
202+
foreach ($images as $i => $image) {
201203
if (!$image->entity->access('view')) {
202204
// If the user does not have permission to view the file, skip it.
203205
continue;
204206
}
207+
208+
$ocrs = $entity->{$ocrField->definition['field_name']};
209+
205210
// Create the IIIF URL for this file
206211
// Visiting $iiif_url will resolve to the info.json for the image.
212+
$ocr = isset($ocrs[$i]) ? $ocrs[$i] : FALSE;
207213
$file_url = $image->entity->createFileUrl(FALSE);
208214
$mime_type = $image->entity->getMimeType();
209215
$iiif_url = rtrim($iiif_address, '/') . '/' . urlencode($file_url);
@@ -241,8 +247,7 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas
241247
}
242248
}
243249
}
244-
245-
$canvases[] = [
250+
$tmp_canvas = [
246251
// @see https://iiif.io/api/presentation/2.1/#canvas
247252
'@id' => $canvas_id,
248253
'@type' => 'sc:Canvas',
@@ -271,6 +276,17 @@ protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_bas
271276
],
272277
],
273278
];
279+
280+
if (isset($ocr) && $ocr != FALSE) {
281+
$tmp_canvas['seeAlso'] = [
282+
'@id' => $ocr->entity->createFileUrl(FALSE),
283+
'format' => 'text/vnd.hocr+html',
284+
'profile' => 'http://kba.cloud/hocr-spec',
285+
'label' => 'hOCR embedded text',
286+
];
287+
}
288+
289+
$canvases[] = $tmp_canvas;
274290
}
275291
}
276292
}
@@ -313,6 +329,7 @@ protected function defineOptions() {
313329
$options = parent::defineOptions();
314330

315331
$options['iiif_tile_field'] = ['default' => ''];
332+
$options['iiif_ocr_file_field'] = ['default' => ''];
316333

317334
return $options;
318335
}
@@ -368,6 +385,15 @@ public function buildOptionsForm(&$form, FormStateInterface $form_state) {
368385
// otherwise could lock up the form when setting up a View.
369386
'#required' => count($field_options) > 0,
370387
];
388+
389+
$form['iiif_ocr_file_field'] = [
390+
'#title' => $this->t('Structured OCR data file field'),
391+
'#type' => 'checkboxes',
392+
'#default_value' => $this->options['iiif_ocr_file_field'],
393+
'#description' => $this->t('The source of structured OCR text for each entity.'),
394+
'#options' => $field_options,
395+
'#required' => FALSE,
396+
];
371397
}
372398

373399
/**

modules/islandora_text_extraction/src/Plugin/Action/GenerateOCRDerivativeFile.php

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
use Drupal\islandora\Plugin\Action\AbstractGenerateDerivativeMediaFile;
99

1010
/**
11-
* Emits a Node for generating fits derivatives event.
11+
* Emits an event for generating OCR derivatives.
1212
*
1313
* @Action(
1414
* id = "generate_extracted_text_file",
@@ -29,6 +29,7 @@ public function defaultConfiguration() {
2929
$config['destination_media_type'] = 'file';
3030
$config['scheme'] = $this->config->get('default_scheme');
3131
$config['destination_text_field_name'] = '';
32+
$config['text_format'] = 'plain_text';
3233
return $config;
3334
}
3435

@@ -38,7 +39,7 @@ public function defaultConfiguration() {
3839
public function buildConfigurationForm(array $form, FormStateInterface $form_state) {
3940
$map = $this->entityFieldManager->getFieldMapByFieldType('text_long');
4041
$file_fields = $map['media'];
41-
$field_options = array_combine(array_keys($file_fields), array_keys($file_fields));
42+
$field_options = ['none' => $this->t('None')] + array_combine(array_keys($file_fields), array_keys($file_fields));
4243
$form = parent::buildConfigurationForm($form, $form_state);
4344
$form['mimetype']['#description'] = $this->t('Mimetype to convert to (e.g. application/xml, etc...)');
4445
$form['mimetype']['#value'] = 'text/plain';
@@ -48,13 +49,23 @@ public function buildConfigurationForm(array $form, FormStateInterface $form_sta
4849
$last = array_slice($form, count($form) - $position + 1);
4950

5051
$middle['destination_text_field_name'] = [
51-
'#required' => TRUE,
52+
'#required' => FALSE,
5253
'#type' => 'select',
5354
'#options' => $field_options,
5455
'#title' => $this->t('Destination Text field Name'),
5556
'#default_value' => $this->configuration['destination_text_field_name'],
5657
'#description' => $this->t('Text field on Media Type to hold extracted text.'),
5758
];
59+
$middle['text_format'] = [
60+
'#type' => 'select',
61+
'#title' => $this->t('Format'),
62+
'#options' => [
63+
'plain_text' => $this->t('Plain text'),
64+
'hocr' => $this->t('hOCR text with positional data'),
65+
],
66+
'#default_value' => $this->configuration['text_format'],
67+
'#description' => $this->t("The type of text to be returned."),
68+
];
5869
$form = array_merge($first, $middle, $last);
5970

6071
unset($form['args']);
@@ -81,17 +92,29 @@ public function validateConfigurationForm(array &$form, FormStateInterface $form
8192
public function submitConfigurationForm(array &$form, FormStateInterface $form_state) {
  // Let the parent store its own settings first, then record the values
  // that are specific to this action.
  parent::submitConfigurationForm($form, $form_state);
  $this->configuration['destination_text_field_name'] = $form_state->getValue('destination_text_field_name');

  // Persist the selected output format and derive the 'args' setting from it
  // in the same place, so the two values are always written together.
  $text_format = $form_state->getValue('text_format');
  $this->configuration['text_format'] = $text_format;
  switch ($text_format) {
    case 'hocr':
      // Extra arguments used when hOCR output is requested.
      $this->configuration['args'] = '-c tessedit_create_hocr=1 -c hocr_font_info=0';
      break;

    case 'plain_text':
      // Plain text uses no extra arguments. This must be written on $this so
      // that arguments saved for a previous hOCR selection are cleared.
      $this->configuration['args'] = '';
      break;
  }
}
85106

86107
/**
87108
* Override this to return arbitrary data as an array to be json encoded.
88109
*/
89110
protected function generateData(EntityInterface $entity) {
111+
90112
$data = parent::generateData($entity);
91113
$route_params = [
92114
'media' => $entity->id(),
93115
'destination_field' => $this->configuration['destination_field_name'],
94116
'destination_text_field' => $this->configuration['destination_text_field_name'],
117+
'text_format' => $this->configuration['text_format'],
95118
];
96119
$data['destination_uri'] = Url::fromRoute('islandora_text_extraction.attach_file_to_media', $route_params)
97120
->setAbsolute()

0 commit comments

Comments
 (0)