summary refs log tree commit diff
diff options
context:
space:
mode:
Diffstat (limited to 'extract/src/docx.c')
-rw-r--r-- extract/src/docx.c 167
1 files changed, 85 insertions, 82 deletions
diff --git a/extract/src/docx.c b/extract/src/docx.c
index 761de176..ca6c5d78 100644
--- a/extract/src/docx.c
+++ b/extract/src/docx.c
@@ -95,7 +95,7 @@ static int s_docx_paragraph_empty(extract_alloc_t* alloc, extract_astring_t* con
content_state.font.size = 10;
content_state.font.bold = 0;
content_state.font.italic = 0;
-
+
if (s_docx_run_start(alloc, content, &content_state)) goto end;
//docx_char_append_string(content, "&#160;"); /* &#160; is non-break space. */
if (s_docx_run_finish(alloc, NULL /*state*/, content)) goto end;
@@ -168,9 +168,9 @@ font. */
if (s_docx_run_finish(alloc, content_state, content)) goto end;
}
if (s_docx_paragraph_finish(alloc, content)) goto end;
-
+
e = 0;
-
+
end:
return e;
}
@@ -245,7 +245,7 @@ static int s_docx_append_image(
static int s_docx_output_rotated_paragraphs(
extract_alloc_t* alloc,
- extract_page_t* page,
+ subpage_t* subpage,
int paragraph_begin,
int paragraph_end,
int rot,
@@ -330,7 +330,7 @@ static int s_docx_output_rotated_paragraphs(
/* Output paragraphs p0..p2-1. */
for (p=paragraph_begin; p<paragraph_end; ++p) {
- paragraph_t* paragraph = page->paragraphs[p];
+ paragraph_t* paragraph = subpage->paragraphs[p];
if (s_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end;
}
@@ -364,7 +364,7 @@ static int s_docx_output_rotated_paragraphs(
extract_astring_cat(alloc, content, " <w:txbxContent>");
for (p=paragraph_begin; p<paragraph_end; ++p) {
- paragraph_t* paragraph = page->paragraphs[p];
+ paragraph_t* paragraph = subpage->paragraphs[p];
if (s_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end;
}
@@ -392,7 +392,7 @@ to the application. */
{
int e = -1;
int y;
-
+
if (extract_astring_cat(alloc, content,
"\n"
" <w:tbl>\n"
@@ -406,14 +406,14 @@ to the application. */
" <w:tr>\n"
" <w:trPr/>\n"
)) goto end;
-
+
for (x=0; x<table->cells_num_x; ++x)
{
cell_t* cell = table->cells[y*table->cells_num_x + x];
if (!cell->left) continue;
-
+
if (extract_astring_cat(alloc, content, " <w:tc>\n")) goto end;
-
+
/* Write cell properties. */
{
if (extract_astring_cat(alloc, content,
@@ -442,7 +442,7 @@ to the application. */
}
if (extract_astring_cat(alloc, content, " </w:tcPr>\n")) goto end;
}
-
+
/* Write contents of this cell. */
{
size_t chars_num_old = content->chars_num;
@@ -476,20 +476,20 @@ to the application. */
}
if (extract_astring_cat(alloc, content, " </w:tbl>\n")) goto end;
e = 0;
-
+
end:
return e;
}
static int s_docx_append_rotated_paragraphs(
extract_alloc_t* alloc,
- extract_page_t* page,
+ subpage_t* subpage,
content_state_t* state,
int* p,
int* text_box_id,
const matrix_t* ctm,
double rotate,
- extract_astring_t* content
+ extract_astring_t* output
)
/* Appends paragraphs with same rotation, starting with page->paragraphs[*p]
and updates *p. */
@@ -501,8 +501,8 @@ and updates *p. */
point_t extent = {0, 0};
int p0 = *p;
int p1;
- paragraph_t* paragraph = page->paragraphs[*p];
-
+ paragraph_t* paragraph = subpage->paragraphs[*p];
+
outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)",
rotate, rotate * 180 / pi,
ctm->e,
@@ -535,8 +535,8 @@ and updates *p. */
ctm->a, ctm->b, ctm->c, ctm->d);
}
- for (*p=p0; *p<page->paragraphs_num; ++(*p)) {
- paragraph = page->paragraphs[*p];
+ for (*p=p0; *p<subpage->paragraphs_num; ++(*p)) {
+ paragraph = subpage->paragraphs[*p];
ctm = &paragraph->lines[0]->spans[0]->ctm;
rotate = atan2(ctm->b, ctm->a);
if (rotate != rotate0) {
@@ -625,13 +625,13 @@ and updates *p. */
x -= dx;
y -= -dy;
- if (s_docx_output_rotated_paragraphs(alloc, page, p0, p1, rot, x, y, w, h, *text_box_id, content, state)) goto end;
+ if (s_docx_output_rotated_paragraphs(alloc, subpage, p0, p1, rot, x, y, w, h, *text_box_id, output, state)) goto end;
}
*p = p1 - 1;
e = 0;
-
+
end:
-
+
return e;
}
@@ -647,38 +647,40 @@ int extract_document_to_docx_content(
int ret = -1;
int text_box_id = 0;
int p;
-
+
/* Write paragraphs into <content>. */
for (p=0; p<document->pages_num; ++p) {
extract_page_t* page = document->pages[p];
-
- int p = 0;
- int t = 0;
-
- content_state_t content_state;
- content_state.font.name = NULL;
- content_state.font.size = 0;
- content_state.font.bold = 0;
- content_state.font.italic = 0;
- content_state.ctm_prev = NULL;
-
- /* Output paragraphs and tables in order of y coordinate. */
- for(;;)
- {
- paragraph_t* paragraph = (p == page->paragraphs_num) ? NULL : page->paragraphs[p];
- table_t* table = (t == page->tables_num) ? NULL : page->tables[t];
- double y_paragraph;
- double y_table;
- if (!paragraph && !table) break;
- y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX;
- y_table = (table) ? table->pos.y : DBL_MAX;
-
- if (paragraph && y_paragraph < y_table)
- {
- const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm;
- double rotate = atan2(ctm->b, ctm->a);
-
- if (spacing
+ int c;
+
+ for (c=0; c<page->subpages_num; ++c) {
+ subpage_t* subpage = page->subpages[c];
+
+ int p = 0;
+ int t = 0;
+
+ content_state_t content_state;
+ content_state.font.name = NULL;
+ content_state.font.size = 0;
+ content_state.font.bold = 0;
+ content_state.font.italic = 0;
+ content_state.ctm_prev = NULL;
+
+ /* Output paragraphs and tables in order of y coordinate. */
+ for(;;) {
+ paragraph_t* paragraph = (p == subpage->paragraphs_num) ? NULL : subpage->paragraphs[p];
+ table_t* table = (t == subpage->tables_num) ? NULL : subpage->tables[t];
+ double y_paragraph;
+ double y_table;
+ if (!paragraph && !table) break;
+ y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX;
+ y_table = (table) ? table->pos.y : DBL_MAX;
+
+ if (paragraph && y_paragraph < y_table) {
+ const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm;
+ double rotate = atan2(ctm->b, ctm->a);
+
+ if (spacing
&& content_state.ctm_prev
&& paragraph->lines_num
&& paragraph->lines[0]->spans_num
@@ -687,37 +689,38 @@ int extract_document_to_docx_content(
&paragraph->lines[0]->spans[0]->ctm
)
) {
- /* Extra vertical space between paragraphs that were at
- different angles in the original document. */
- if (s_docx_paragraph_empty(alloc, content)) goto end;
- }
+ /* Extra vertical space between paragraphs that were at
+ different angles in the original document. */
+ if (s_docx_paragraph_empty(alloc, content)) goto end;
+ }
- if (spacing) {
- /* Extra vertical space between paragraphs. */
- if (s_docx_paragraph_empty(alloc, content)) goto end;
- }
+ if (spacing) {
+ /* Extra vertical space between paragraphs. */
+ if (s_docx_paragraph_empty(alloc, content)) goto end;
+ }
- if (rotation && rotate != 0)
- {
- if (s_docx_append_rotated_paragraphs(alloc, page, &content_state, &p, &text_box_id, ctm, rotate, content)) goto end;
+ if (rotation && rotate != 0)
+ {
+ if (s_docx_append_rotated_paragraphs(alloc, subpage, &content_state, &p, &text_box_id, ctm, rotate, content)) goto end;
+ }
+ else
+ {
+ if (s_document_to_docx_content_paragraph(alloc, &content_state, paragraph, content)) goto end;
+ }
+ p += 1;
}
- else
+ else if (table)
{
- if (s_document_to_docx_content_paragraph(alloc, &content_state, paragraph, content)) goto end;
+ if (s_docx_append_table(alloc, table, content)) goto end;
+ t += 1;
}
- p += 1;
- }
- else if (table)
- {
- if (s_docx_append_table(alloc, table, content)) goto end;
- t += 1;
}
- }
-
- if (images) {
- int i;
- for (i=0; i<page->images_num; ++i) {
- s_docx_append_image(alloc, content, &page->images[i]);
+
+ if (images) {
+ int i;
+ for (i=0; i<subpage->images_num; ++i) {
+ s_docx_append_image(alloc, content, &subpage->images[i]);
+ }
}
}
}
@@ -759,7 +762,7 @@ int extract_docx_content_item(
extract_astring_t temp;
extract_astring_init(&temp);
*text2 = NULL;
-
+
if (0)
{}
else if (!strcmp(name, "[Content_Types].xml")) {
@@ -841,7 +844,7 @@ int extract_docx_content_item(
return e;
}
-
+
int extract_docx_write_template(
extract_alloc_t* alloc,
@@ -862,7 +865,7 @@ int extract_docx_write_template(
assert(path_out);
assert(path_template);
-
+
if (extract_check_path_shell_safe(path_out)) {
outf("path_out is unsafe: %s", path_out);
goto end;
@@ -889,7 +892,7 @@ int extract_docx_write_template(
/* Might be nice to iterate through all items in path_tempdir, but for now
we look at just the items that we know extract_docx_content_item() will
modify. */
-
+
{
const char* names[] = {
"word/document.xml",
@@ -904,7 +907,7 @@ int extract_docx_write_template(
extract_free(alloc, &text2);
if (extract_asprintf(alloc, &path, "%s/%s", path_tempdir, name) < 0) goto end;
if (extract_read_all_path(alloc, path, &text)) goto end;
-
+
if (extract_docx_content_item(
alloc,
contentss,
@@ -926,14 +929,14 @@ int extract_docx_write_template(
extract_free(alloc, &path);
if (extract_asprintf(alloc, &path, "%s/word/media", path_tempdir) < 0) goto end;
if (extract_mkdir(path, 0777)) goto end;
-
+
for (i=0; i<images->images_num; ++i) {
image_t* image = &images->images[i];
extract_free(alloc, &path);
if (extract_asprintf(alloc, &path, "%s/word/media/%s", path_tempdir, image->name) < 0) goto end;
if (extract_write_all(image->data, image->data_size, path)) goto end;
}
-
+
outf("Zipping tempdir to create %s", path_out);
{
const char* path_out_leaf = strrchr(path_out, '/');