/* QQ -- Reading Quark Xpress 4.1 for Windows Documents Copyright (C) 2002 Frans Faase This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version, but WITH THE EXPLICT AMMENDMENT: that any additional discoveries you make about the Quark Xpress file formats are made public under the GNU General Public License. (The most convient manner to do this, it to mail me the updated sources, and I will publish them on my website.) This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
GNU General Public License: http://www.iwriteiam.nl/GNU.txt Latest version and any additional documentation can be found on: http://www.iwriteiam.nl/QX.html */ #include #include #include #include #include /* Define some elementary types: */ #define TRUE 1 #define FALSE 0 typedef unsigned char byte; typedef unsigned short word; typedef unsigned long lword; /* Allocate a record of type T: */ #define ALLOC(T) (T*)malloc(sizeof(T)) /* Allocate an array of N records of type T: */ #define NALLOC(N,T) (T*)malloc((N)*sizeof(T)) /* Allocate space in which the string L would fit: */ #define SALLOC(L) (char*)malloc((strlen(L)+1)*sizeof(char)) /* Make an allocated copy of a string: */ char *strcopy(char *s) { char *r = SALLOC(s); strcpy(r,s); return r; } #define FORALL_IN_LIST(E,L) for (E = L; E != NULL; E = E->next) class CBuf { public: byte *buf; lword length; CBuf(void) { buf = NULL; length = 0L; } byte operator[](lword i) { return buf[i]; } bool read_file(char *file_name) { FILE *f = fopen(file_name, "rb"); if (f == NULL) return FALSE; int fh = fileno(f); length = lseek(fh, 0L, SEEK_END); lseek(fh, 0L, SEEK_SET); buf = NALLOC(length, byte); length = fread(buf, 1, length, f); fclose(f); return TRUE; } }; #define READ_TEXT #define P_CLOSE(X) /**/ printf X /**/ #define DUMP_BLOCKS(X) /**/ printf X /**/ #define DUMP_SKIP(X) /**/ printf X /**/ #define DUMP_TRAIL(X) /**/ printf X /**/ #define DUMP_FORMAT(X) /**/ printf X /**/ /************** Input stream **************/ struct { byte *buf; lword length; lword pos; lword blockend; word (*read_word)(void); lword (*read_lword)(void); word (*read_stream_word)(void); lword (*read_stream_lword)(void); void (*finish_stream)(void); } input; lword input_read_lword() { lword b1 = input.buf[input.pos++], b2 = input.buf[input.pos++], b3 = input.buf[input.pos++], b4 = input.buf[input.pos++]; return b1 + (b2 << 8) + (b3 << 16) + (b4 << 24); } word input_read_word() { lword b1 = input.buf[input.pos++], b2 = input.buf[input.pos++]; return b1 + 
(b2 << 8); } void input_stream_skip() { if (input.pos == input.blockend) { long nextblock; DUMP_SKIP(("\nSkip: At %0X ", input.pos)); nextblock = input_read_lword(); if (nextblock != 0) { if (nextblock < 0) { word nrblocks; input.pos = (-nextblock) * 256 - 256; nrblocks = input_read_word(); input.blockend = input.pos - 2 + 256 * nrblocks - 4; } else { input.pos = nextblock * 256 - 256; input.blockend = input.pos + 256 - 4; } DUMP_SKIP(("\nSkip: Start new block at %8X till %8X", input.pos, input.blockend)); } } } word input_read_stream_word() { input_stream_skip(); return input_read_word(); } lword input_read_stream_lword() { lword w1 = input_read_stream_word(); lword w2 = input_read_stream_word(); return (w2 << 16) | w1; } void input_finish_stream() { DUMP_SKIP(("\nFinish from %ld till %d: ", input.pos, input.blockend)); while (input.pos < input.blockend) { byte b = input.buf[input.pos++]; /* if (b != 0) DUMP_SKIP((" %02X", b)); */ input_stream_skip(); } DUMP_SKIP(("\n")); } void init_input() { input.length = 0L; input.pos = 0L; input.read_lword = input_read_lword; input.read_word = input_read_word; input.read_stream_lword = input_read_stream_lword; input.read_stream_word = input_read_stream_word; input.finish_stream = input_finish_stream; } /******* storage of fonts **********/ #define FONT_PLAIN 0 #define FONT_BOLD 1 #define FONT_ITALIC 2 typedef struct { word id; char *family_name; char *names[4]; } font_t; font_t *fonts = NULL; word nr_fonts = 0; font_t *null_font() { static bool initialized = FALSE; static font_t font; if (!initialized) { font.id = 0; font.family_name = "Unknown"; font.names[0] = "Unknown"; font.names[1] = "UnknownBold"; font.names[2] = "UnknownItalic"; font.names[3] = "UnknownBoldItalic"; initialized = TRUE; } return &font; } font_t *get_font(word id) { word i; for (i = 0; i < nr_fonts; i++) if (fonts[i].id == id) return &fonts[i]; return null_font(); } void print_fonts() { word i; for (i = 0; i < nr_fonts; i++) { printf("%3d %6d %s: %s %s %s 
%s\n", i, fonts[i].id, fonts[i].family_name, fonts[i].names[FONT_PLAIN], fonts[i].names[FONT_BOLD], fonts[i].names[FONT_ITALIC], fonts[i].names[FONT_BOLD+FONT_ITALIC]); } } /******* storage of paragraph style definitions *****/ typedef struct par_style_def_T { struct par_style_def_T *next; word id; char *name; font_t *def_font; } par_style_def_t; par_style_def_t *par_style_defs = NULL; par_style_def_t *null_par_style_def() { static bool initialized = FALSE; static par_style_def_t par_style_def; if (!initialized) { par_style_def.next = &par_style_def; par_style_def.id = 0; par_style_def.name = "PLAIN"; par_style_def.def_font = null_font(); initialized = TRUE; } return &par_style_def; } void add_par_style_def(word id, char *name, font_t *def_font) { par_style_def_t **r_par_style_def; for (r_par_style_def = &par_style_defs; *r_par_style_def != NULL; r_par_style_def = &(*r_par_style_def)->next); (*r_par_style_def) = ALLOC(par_style_def_t); (*r_par_style_def)->next = NULL; (*r_par_style_def)->id = id; (*r_par_style_def)->name = name; (*r_par_style_def)->def_font = def_font; } par_style_def_t *get_par_style_def(word id) { par_style_def_t *par_style_def; FORALL_IN_LIST(par_style_def, par_style_defs) if (par_style_def->id == id) return par_style_def; return null_par_style_def(); } void print_par_style_def() { par_style_def_t *par_style_def; FORALL_IN_LIST(par_style_def, par_style_defs) { printf("%3d %s", par_style_def->id, par_style_def->name); if (par_style_def->def_font != NULL) printf(" %s", par_style_def->def_font->family_name); printf("\n"); } } /******* storage of character styles *******/ typedef struct { font_t *font; byte mode; word size; } char_style_t; char_style_t *char_styles = NULL; word nr_char_styles = 0; char_style_t *null_char_style() { static bool initialized = FALSE; static char_style_t char_style; if (!initialized) { char_style.font = null_font(); char_style.mode = 0; char_style.size = 0; initialized = TRUE; } return &char_style; } char_style_t 
*char_style(word i)
{
    /* Character style by index; the placeholder style when i is out of range. */
    return i < nr_char_styles ? &char_styles[i] : null_char_style();
}

/* Map the two low bits of the style's mode to one of 'N', 'B', 'I', 'X'. */
char char_style_mode_char(char_style_t *char_style)
{
    char *mode_ch = "NBIX";
    return mode_ch[char_style->mode & 3];
}

/******* storage of paragraph styles *******/

typedef struct
{
    par_style_def_t *par_style_def;
} par_style_t;

par_style_t *par_styles = NULL;
word nr_par_styles = 0;

/* Return the shared placeholder paragraph style, which refers to the
   placeholder style definition. */
par_style_t *null_par_style()
{
    static bool initialized = FALSE;
    static par_style_t par_style;
    if (!initialized)
    {
        par_style.par_style_def = null_par_style_def();
        initialized = TRUE;
    }
    return &par_style;
}

/* Paragraph style by index; the placeholder style when i is out of range. */
par_style_t *par_style(word i)
{
    return i < nr_par_styles ? &par_styles[i] : null_par_style();
}

/******* storage of texts *********/

typedef struct text_T text_t;
typedef struct sub_text_T sub_text_t;

/* One style run: `len` characters using the style with index `code`. */
typedef struct
{
    lword len;
    lword code;
} style_t;

struct text_T
{
    text_t *next;
    unsigned char *text;
    lword text_len;
    lword id;                /* 0 until get_text_with_id() assigns one */
    style_t *char_styles;
    lword nr_char_styles;
    style_t *par_styles;
    lword nr_par_styles;
    sub_text_t *sub_texts;   /* kept ordered by start, see init_sub_text() */
};

struct sub_text_T
{
    sub_text_t *next;
    text_t *text;
    struct frame_T *frame;
    lword start;
};

/* Allocate a text with every field cleared. */
text_t *new_text()
{
    text_t *text = ALLOC(text_t);
    text->next = NULL;
    text->text = NULL;
    text->text_len = 0;
    text->id = 0;
    text->char_styles = NULL;
    text->nr_char_styles = 0;
    text->par_styles = NULL;
    text->nr_par_styles = 0;
    text->sub_texts = NULL;
    return text;
}

/* Find the list entry that carries `id`, assigning ids on demand.
   - An entry without an id (0), or one that already has `id`, gets `id`
     and is returned.
   - On meeting an entry with a larger id, that entry takes `id` and the
     ids and sub_text lists of the following entries are shifted one
     position down the list, until an entry without an id absorbs the last
     displaced id.
   Returns NULL when the list ends before either case applies. */
text_t *get_text_with_id(text_t *texts, lword id)
{
    text_t *text;
    FORALL_IN_LIST(text, texts)
        if (text->id == 0 || text->id == id)
        {
            text->id = id;
            return text;
        }
        else if (text->id > id)
        {
            /* Displaced id and sub_texts, to be handed to the next entry. */
            lword keep_id = text->id;
            sub_text_t *keep_sub_texts = text->sub_texts;
            text_t *result_text;
            text->id = id;
            text->sub_texts = NULL;
            result_text = text;
            while (keep_id != 0)
            {
                lword id;
                sub_text_t *sub_texts;
                sub_text_t *sub_text;
                text = text->next;
                if (text == NULL)
                {
                    /* No entry left for the displaced sub_texts: detach
                       them from any text. */
                    printf("To few texts\n");
                    FORALL_IN_LIST(sub_texts, keep_sub_texts)
                        sub_texts->text = NULL;
                    return result_text;
                }
                /* Swap the carried id with this entry's id. */
                id = text->id;
                text->id = keep_id;
                keep_id = id;
sub_texts = text->sub_texts; text->sub_texts = keep_sub_texts; keep_sub_texts = sub_texts; FORALL_IN_LIST(sub_text, text->sub_texts) sub_text->text = text; } return result_text; } return NULL; } void init_sub_text(sub_text_t *_this, text_t *text, lword start) { sub_text_t **r_sub_text; _this->text = text; _this->start = start; for (r_sub_text = &text->sub_texts; *r_sub_text != NULL && (*r_sub_text)->start < start; r_sub_text = &(*r_sub_text)->next); _this->next = *r_sub_text; *r_sub_text = _this; } lword sub_text_end(sub_text_t *_this) { if (_this->next != NULL) return _this->next->start; else return _this->text->text_len; } /************* Read texts **************/ void read_text(text_t *text) { unsigned char *cur_text; unsigned char *end_text; lword len_text_pointers; word nr_text_pointers; lword max_text_pos = input.pos; int i; DUMP_BLOCKS(("\nBlock: %4d %06x ", input.pos / 256, input.pos)); input.blockend = input.pos + 256 - 4; /* Read nr of characters in _this text fragment, and allocate string */ text->text_len = input.read_lword(); text->text = NALLOC(text->text_len, unsigned char); cur_text = text->text; end_text = text->text + text->text_len; /* Read the nr of pointers to blocks containing the text */ len_text_pointers = input.read_lword(); nr_text_pointers = len_text_pointers / 6; DUMP_FORMAT(("\nText: nr chars = %ld text_pointers %08X %d", text->text_len, len_text_pointers, nr_text_pointers)); /* Read the text from the indicate blocks */ for (i = 0; i < nr_text_pointers; i++) { lword text_pointer; word text_length; lword text_pos; int j; text_pointer = input.read_stream_lword(); text_length = input.read_stream_word(); if (text_length > 256) { printf("Error (read_text): text lenght = %d\n", text_length); exit(1); } DUMP_FORMAT(("\nText: %08X %d", text_pointer, text->text_len)); text_pos = (text_pointer - 1) * 256; for (j = 0; j < text_length; j++) if (cur_text == end_text) { printf("\nError (read_text): more characters than specified %ld\n", 
text->text_len); exit(1); } else *(cur_text++) = input.buf[text_pos++]; if (j < 256) { DUMP_TRAIL(("\nText: Trail %d: ",j)); for (; j < 256; j++) { byte b = input.buf[text_pos++]; DUMP_TRAIL((" %02X", b)); } } DUMP_TRAIL(("\nText: pos = %X %X\n", text_pos, max_text_pos)); if (text_pos > max_text_pos) max_text_pos = text_pos; } if (cur_text < end_text) { printf("\nError (read_text): less characters %d than specified %ld\n", cur_text - text->text, text->text_len); exit(1); } /* Read nr of character styles, and allocate styles */ text->nr_char_styles = input.read_stream_lword() / 6; text->char_styles = NALLOC(text->nr_char_styles, style_t); DUMP_FORMAT(("\nText: nr ch styles = %ld", text->nr_char_styles)); /* read the character styles */ for (i = 0; i < text->nr_char_styles; i++) { text->char_styles[i].code = input.read_stream_word(); text->char_styles[i].len = input.read_stream_lword(); DUMP_FORMAT(("\nText: %6X ch style[%d] len %ld code %d", input.pos, i, text->char_styles[i].len, text->char_styles[i].code)); } DUMP_FORMAT(("\nText: Reading par styles from %x", input.pos)); /* read nr of paragraph styles */ text->nr_par_styles = input.read_stream_lword() / 6; text->par_styles = NALLOC(text->nr_par_styles, style_t); DUMP_FORMAT(("\nText: nr par styles = %ld", text->nr_char_styles)); /* read the paragraph styles */ for (i = 0; i < text->nr_par_styles; i++) { text->par_styles[i].code = input.read_stream_word(); text->par_styles[i].len = input.read_stream_lword(); DUMP_FORMAT(("\nText: %6X par style[%d] len %ld code %d", input.pos, i, text->par_styles[i].len, text->par_styles[i].code)); } input.finish_stream(); if (input.pos < max_text_pos) input.pos = max_text_pos; DUMP_FORMAT(("\n")); } text_t *read_texts(lword begin_text, lword end_text) { text_t *texts = NULL; text_t **r_text = &texts; DUMP_SKIP(("text starts at %lx and end at %lx\n", begin_text, end_text)); for (input.pos = begin_text; input.pos < end_text; ) { text_t *text = new_text(); *r_text = text; r_text = 
&(text->next); read_text(text); } DUMP_SKIP(("\n")); return texts; } /************* text read iterator ********/ typedef struct { lword i; unsigned char ch; char_style_t *char_style; par_style_t *par_style; /* private */ text_t *text; lword next_char_style; lword char_style_till; lword next_par_style; lword par_style_till; } text_read_t; void init_text_read(text_read_t *_this, text_t *text) { _this->text = text; _this->i = 0; _this->next_char_style = 0; _this->char_style_till = 0; _this->next_par_style = 0; _this->par_style_till = 0; } bool more_text_read(text_read_t *_this) { text_t *_this_text = _this->text; if (_this->i >= _this_text->text_len) return FALSE; if (_this->i == _this->par_style_till && _this->next_par_style < _this_text->nr_par_styles) { _this->par_style = par_style(_this_text->par_styles[_this->next_par_style].code); _this->par_style_till += _this_text->par_styles[_this->next_par_style].len; _this->next_par_style++; } if (_this->i == _this->char_style_till && _this->next_char_style < _this_text->nr_char_styles) { _this->char_style = char_style(_this_text->char_styles[_this->next_char_style].code); _this->char_style_till += _this_text->char_styles[_this->next_char_style].len; _this->next_char_style++; } _this->ch = _this_text->text[_this->i]; return TRUE; } void next_text_read(text_read_t *_this) { _this->i++; } void proc_text_read(text_read_t *_this) { _this->par_style = null_par_style(); _this->char_style = null_char_style(); } /*************** print texts as XML *************/ void print_text_range_as_XML(text_t *text, lword start, lword end, FILE *f, word indent, bool full) { par_style_t *cur_par_style = null_par_style(); char_style_t *cur_char_style = null_char_style(); text_read_t text_read; for (init_text_read(&text_read, text); more_text_read(&text_read) && text_read.i < end; next_text_read(&text_read)) if (text_read.i >= start) { bool char_style_changed = FALSE; if (text_read.char_style != null_char_style()) { char_style_changed = 
cur_char_style == null_char_style() || text_read.char_style->size != cur_char_style->size || text_read.char_style->mode != cur_char_style->mode || strcmp(text_read.char_style->font->family_name, cur_char_style->font->family_name); } if (text_read.par_style != null_par_style() || char_style_changed) { if (full && cur_char_style != null_char_style()) fprintf(f, ""); if (text_read.par_style != null_par_style()) { if (cur_par_style != null_par_style()) { if (full) fprintf(f, ""); fprintf(f, "\n"); } cur_par_style = text_read.par_style; fprintf(f, "%*.*s", indent, indent, " ", cur_par_style->par_style_def->name); } if (char_style_changed) cur_char_style = text_read.char_style; if (cur_char_style != null_char_style()) fprintf(f, "", cur_char_style->font->family_name, char_style_mode_char(cur_char_style), cur_char_style->size); } { byte ch = text_read.ch; if (ch == '&') fprintf(f, "&"); else if (ch == '<') fprintf(f, "<"); else if (ch == '>') fprintf(f, ">"); else if (ch >= ' ' && ch < 126) fprintf(f, "%c", ch); else { word chval = ch; //fprintf(f, "&#%d;", chval); } } proc_text_read(&text_read); } if (full && cur_char_style != null_char_style()) fprintf(f, ""); if (full && cur_par_style != null_par_style()) fprintf(f, ""); fprintf(f, "\n"); } void print_text_as_XML(text_t *text, FILE *f, word indent, bool full) { print_text_range_as_XML(text, 0, text->text_len, f, indent, full); } void print_texts_as_XML(text_t *texts, FILE *f, word indent, bool full) { text_t *text; FORALL_IN_LIST(text, texts) print_text_as_XML(text, f, indent, full); } void print_sub_text_as_XML(sub_text_t *sub_text, FILE *f, word indent, bool full) { if (sub_text->text != NULL) print_text_range_as_XML(sub_text->text, sub_text->start, sub_text_end(sub_text), f, indent, full); } /************ Boxes *********************/ typedef struct { lword x1, y1, x2, y2; } box_t; /************ Frames and pages *********************/ typedef struct page_T page_t; typedef struct frame_T frame_t; struct page_T { 
page_t *next;
    frame_t **frames;    /* array of nr_frames frame pointers */
    lword nr_frames;
};

/* Allocate an empty page.  When `pages` is not NULL the page is also
   appended to the end of the list that `pages` points to. */
page_t *new_page(page_t **pages)
{
    page_t *page = ALLOC(page_t);
    page->next = NULL;
    page->frames = NULL;
    page->nr_frames = 0;
    if (pages != NULL)
    {
        page_t **r_page;
        /* Advance to the NULL link that terminates the list. */
        for (r_page = pages; *r_page != NULL; r_page = &(*r_page)->next);
        *r_page = page;
    }
    return page;
}

/* Allocate the frame pointer array of a page and clear every slot. */
void page_set_nr_frames(page_t *_this, lword nr_frames)
{
    lword i;
    _this->nr_frames = nr_frames;
    _this->frames = NALLOC(nr_frames, frame_t*);
    for (i = 0; i < nr_frames; i++)
        _this->frames[i] = 0;
}

struct frame_T
{
    page_t *page;
    box_t box;
    lword nr_in_file;
    lword id;
    byte type;
    sub_text_t sub_text;
    frame_t *parent;
    frame_t **children;
    word nr_children;
    frame_t *sub_frames;
};

/* Allocate a frame and, when `page` is not NULL, store it in slot `pos` of
   the page's frame array.  box and sub_text.start are not initialised
   here. */
frame_t *new_frame(page_t *page, lword pos)
{
    frame_t *frame = ALLOC(frame_t);
    frame->page = page;
    frame->nr_in_file = 0;
    frame->id = 0;
    frame->type = 0;
    frame->sub_text.next = NULL;
    frame->sub_text.text = NULL;
    frame->sub_text.frame = frame;
    frame->sub_frames = NULL;
    frame->parent = NULL;
    frame->children = NULL;
    frame->nr_children = 0;
    if (page != NULL)
    {
        page->frames[pos] = frame;
    }
    return frame;
}

/* TRUE when both frames have identical boxes. */
bool frame_equal(frame_t *a, frame_t *b)
{
    return    (a->box.x1 == b->box.x1)
           && (a->box.x2 == b->box.x2)
           && (a->box.y1 == b->box.y1)
           && (a->box.y2 == b->box.y2);
}

/* TRUE when inner's box lies within outer's box and the boxes differ. */
bool frame_inside(frame_t *inner, frame_t *outer)
{
    return    (inner->box.x1 >= outer->box.x1)
           && (inner->box.x2 <= outer->box.x2)
           && (inner->box.y1 >= outer->box.y1)
           && (inner->box.y2 <= outer->box.y2)
           && !frame_equal(inner, outer);
}

/* qsort comparison for arrays of frame_t pointers, based on the boxes. */
int compare_frames(const void *e1, const void *e2)
{
    frame_t **f1 = (frame_t **)e1;
    box_t *b1 = &(*f1)->box;
    frame_t **f2 = (frame_t **)e2;
    box_t *b2 = &(*f2)->box;
    bool b1leftb2, b2leftb1, b1beforeb2, b2beforeb1;

    /* Frames whose x1 exceeds 70000000 sort after all others. */
    if ((b1->x1 > 70000000) < (b2->x1 > 70000000)) return -1;
    if ((b1->x1 > 70000000) > (b2->x1 > 70000000)) return 1;

    b1leftb2 = b1->x1 < b2->x1 && b1->y1 < b2->y2;
    b2leftb1 = b2->x1 < b1->x1 && b2->y1 < b1->y2;
    b1beforeb2 = b1->y1 <= b2->y1 || b1leftb2;
    b2beforeb1 = b2->y1 <= b1->y1 ||
b2leftb1;

    if (b1beforeb2)
        if (b2beforeb1)
        {
            /* Each is "before" the other: let the left one win, else fall
               through to the tie-breakers below. */
            if (b1leftb2) return -1;
            if (b2leftb1) return 1;
        }
        else
            return -1;
    else if (b2beforeb1)
        return 1;

    /* Tie-breakers: larger y2 first, then larger x2 first. */
    if (b1->y2 > b2->y2) return -1;
    if (b1->y2 < b2->y2) return 1;
    if (b1->x2 > b2->x2) return -1;
    if (b1->x2 < b2->x2) return 1;
    return 0;
}

/* Build the containment hierarchy of the frames of a page and sort them:
   1. give every frame as parent the innermost frame that contains it,
   2. allocate and fill each frame's children array,
   3. sort the page's frames and every children array with compare_frames.
   NOTE(review): the frame count is held in a `word` here while
   page->nr_frames is an lword. */
void organize_page(page_t *page)
{
    frame_t **frames = page->frames;
    word nr_frames = page->nr_frames;
    word i;

    for (i = 0; i < nr_frames; i++)
    {
        frame_t *frame = frames[i],
                *outer = NULL;
        word j;
        for (j = 0; j < nr_frames; j++)
        {
            frame_t *_try = frames[j];
            /* Keep the candidate that lies inside the best one so far. */
            if (frame_inside(frame, _try))
                if (outer == NULL || frame_inside(_try, outer))
                    outer = _try;
        }
        if (outer)
        {
            /* printf("%d parent of %d\n", outer->nr_in_file, frame->nr_in_file);*/
            frame->parent = outer;
            outer->nr_children++;
        }
    }

    /* Allocate the children arrays; the counters are reset so they can be
       used as fill indices in the next loop. */
    for (i = 0; i < nr_frames; i++)
    {
        frame_t *frame = frames[i];
        frame->children = NALLOC(frame->nr_children, frame_t*);
        frame->nr_children = 0;
    }

    for (i = 0; i < nr_frames; i++)
    {
        frame_t *frame = frames[i];
        frame_t *parent = frame->parent;
        if (parent != NULL)
            parent->children[parent->nr_children++] = frame;
    }

    qsort(frames, nr_frames, sizeof(frame_t *), &compare_frames);

    for (i = 0; i < nr_frames; i++)
    {
        frame_t *frame = frames[i];
        if (frame->nr_children > 0)
            qsort(frame->children, frame->nr_children, sizeof(frame_t *), &compare_frames);
    }
}

/* Print a frame, its children (recursively, indented two further) and its
   text.
   NOTE(review): the first format string has fewer conversions than
   arguments; it looks as if markup was stripped from it. */
void print_frame(FILE *f, frame_t *frame, word level)
{
    word i;
    fprintf(f, "%*.*s\n", level, level, "",
            frame->nr_in_file,
            frame->sub_text.text ? frame->sub_text.text->id : (lword)-1,
            frame->sub_text.text ? frame->sub_text.start : (lword)-1,
            frame->sub_text.text ?
sub_text_end(&frame->sub_text) : (lword)-1, frame->box.x1, frame->box.y1, frame->box.x2, frame->box.y2); for (i = 0; i < frame->nr_children; i++) print_frame(f, frame->children[i], level+2); print_sub_text_as_XML(&frame->sub_text, f, level+2, TRUE); fprintf(f, "%*.*s\n", level, level, ""); } /************ Analyse binary data *******/ byte *data; lword dlen = 0L; lword dpos = 0L; bool echo; bool echo_all; bool translate; void dump_context(lword p) { int i,k; /*printf("\nFound error at %lX\n bytes before:", p); for (k = -64; k < 0; k += 16) { printf("\n "); for (i = k; i < k+16; i++) if (p+i > 0L) printf("%02X ", data[p+i]); } printf("\n bytes at:"); */ for (k =0; k < 1024; k += 32) { printf("\n"); for (i = k; i < k+32; i++) if (p+i < dlen) printf("%02X ", data[p+i]); } printf("\n"); exit(1); } word dword(word pos) { lword b1 = data[pos], b2 = data[pos+1]; return b1 + (b2 << 8); } lword dlword(lword pos) { lword b1 = data[pos], b2 = data[pos+1], b3 = data[pos+2], b4 = data[pos+3]; return b1 + (b2 << 8) + (b3 << 16) + (b4 << 24); } void expect_long(lword val, char *info) { if (dlword(dpos) != val) { printf("\nexpect_long: v[%ld] = %08lX %s\n", dpos, val, info); dump_context(dpos); } if (echo_all) printf("%02X_%02X_%02X_%02X_", data[dpos], data[dpos+1], data[dpos+2], data[dpos+3]); dpos += 4; } void expect_short(word val, char *info) { if (data[dpos] + (data[dpos+1] << 8) != val) { printf("\nexpect_short: v[%ld] = %04X %s\n", dpos, val, info); dump_context(dpos); } if (echo_all) printf("%02X_%02X_", data[dpos], data[dpos+1]); dpos += 2; } void parse_bytes(int l) { for(; l > 0; l--) { if (echo) printf(data[dpos] > ' ' && data[dpos] < 127 ? " %c " : "%02X ", data[dpos]); dpos++; } } #define STR_BUF_LEN 1000 char string_buf[STR_BUF_LEN+1]; char *parse_string() { word i = 0; if (echo) printf("\""); while(data[dpos] != '\0') { if (echo) printf(data[dpos] >= ' ' && data[dpos] < 127 ? 
"%c" : "(%02X)", data[dpos]); if (i < STR_BUF_LEN) string_buf[i++] = data[dpos]; dpos++; } if (echo) printf("\" "); dpos++; string_buf[i] = '\0'; return string_buf; } char *parse_2string() { word i = 0; if (echo) printf("\""); while(data[dpos] != '\0') { if (echo) printf(data[dpos] >= ' ' && data[dpos] < 127 ? "%c" : "(%02X)", data[dpos]); if (i < STR_BUF_LEN) string_buf[i++] = data[dpos]; dpos++; } if (echo) printf("\" "); dpos++; if (dpos % 2) dpos++; string_buf[i] = '\0'; return string_buf; } lword parse_long() { lword r = dlword(dpos); if (echo) printf("%02X %02X %02X %02X ", data[dpos], data[dpos+1], data[dpos+2], data[dpos+3]); if (translate) printf("(l:%ld) ", r); dpos += 4; return r; } byte parse_byte() { byte r = data[dpos]; if (echo) printf("%02X ", data[dpos]); if (translate) printf("(b:%d) ", data[dpos]); dpos += 1; return r; } word parse_short() { word r = data[dpos] + (data[dpos+1] << 8); if (echo) printf("%02X %02X ", data[dpos], data[dpos+1]); if (translate) printf("(s:%d) ", r); dpos += 2; return r; } void parse_box(box_t *box) { if (translate) printf("["); box->y1 = parse_long(); box->x1 = parse_long(); box->y2 = parse_long(); box->x2 = parse_long(); if (translate) printf("]"); } void parse_font_families(lword end_dpos) { word i; if (echo) printf("\nFonts\nNr= "); nr_fonts = parse_short(); if (echo) printf("\n"); fonts = NALLOC(nr_fonts, font_t); for (i = 0; i < nr_fonts; i++) { fonts[i].id = parse_short(); fonts[i].family_name = strcopy(parse_string()); fonts[i].names[0] = strcopy(parse_string()); fonts[i].names[1] = ""; fonts[i].names[2] = ""; fonts[i].names[3] = ""; if (echo) printf("\n"); } if (dpos != end_dpos) printf("dpos = %ld, end_dpos = %ld\n", dpos, end_dpos); printf("\n"); } void parse_fonts(lword end_dpos) { if (echo) printf("\nFont families\n"); parse_bytes(12); if (echo) printf("\n"); while (dpos < end_dpos) { word font_id; byte nr[4]; char *s[4]; font_t *font; font_id = parse_short(); parse_long(); parse_long(); nr[0] = 
parse_byte();
        nr[1] = parse_byte();
        nr[2] = parse_byte();
        nr[3] = parse_byte();
        s[1] = strcopy(parse_string());
        s[2] = strcopy(parse_string());
        s[3] = strcopy(parse_string());
        font = get_font(font_id);
        if (font != NULL)
        {
            int j;
            /* ?? _this is not complete correct ?? */
            s[0] = font->names[0];
            /* nr[j] in 1..4 selects which of s[0..3] becomes names[j]. */
            for (j = 0; j < 4; j++)
                if (nr[j] >= 1 && nr[j] <= 4)
                    font->names[j] = s[nr[j]-1];
                else
                    font->names[j] = "";
            /* NOTE(review): this break leaves the enclosing while loop, and
               get_font() returns the placeholder font rather than NULL, so
               only the first record is ever processed - confirm that this
               is intended. */
            break;
        }
        if (echo) printf("\n");
    }
    if (echo)
    {
        if (dpos != end_dpos)
            printf("dpos = %ld, end_dpos = %ld\n", dpos, end_dpos);
        printf("\n");
    }
}

/* Walk over the colour section: a 34-byte header, then per colour 50 bytes
   followed by an even-aligned string.  Nothing is stored. */
void parse_colours(lword end_dpos)
{
    if (echo) printf("\nColors\n");
    parse_bytes(34);
    if (echo) printf("\n");
    while (dpos < end_dpos)
    {
        parse_bytes(50);
        if (echo) printf("\n");
        parse_2string();
        if (echo) printf("\n");
    }
    if (echo)
    {
        if (dpos != end_dpos)
            printf("dpos = %ld, end_dpos = %ld\n", dpos, end_dpos);
        printf("\n");
    }
}

/* Read the paragraph style definitions up to end_dpos and register each
   with add_par_style_def().  '<' and '>' in a name are replaced by '_'. */
void parse_par_style_defs(lword end_dpos)
{
    word i = 0;
    if (echo) printf("\nParagraph style definitions\n\n");
    while (dpos < end_dpos)
    {
        word font_id;
        long id;
        char *name;
        parse_short();
        font_id = parse_short();
        parse_bytes(294);
        id = parse_long();
        parse_long();
        if (echo) printf("\n%3d : ", i++);
        name = strcopy(parse_2string());
        {
            char *s;
            for (s = name; *s != '\0'; s++)
                if (*s == '<' || *s == '>')
                    *s = '_';
        }
        if (echo) printf("\n");
        add_par_style_def(id, name, get_font(font_id));
    }
    if (echo)
    {
        if (dpos != end_dpos)
            printf("dpos = %ld, end_dpos = %ld\n", dpos, end_dpos);
        printf("\n");
    }
}

/* Read the character style table into the global char_styles.  The section
   is only parsed when its size is a multiple of the 46-byte record size. */
void parse_char_styles(lword end_dpos)
{
    if (echo) printf("\nCharacter styles\n\n");
    if ((end_dpos - dpos) % 46 == 0)
    {
        word i;
        nr_char_styles = (end_dpos - dpos) / 46;
        char_styles = NALLOC(nr_char_styles, char_style_t);
        for (i = 0; i < nr_char_styles; i++)
        {
            word font_id;
            byte mode;
            word size;
            if (echo) printf("%3d : ", i);
            parse_short();
            font_id = parse_short();
            mode = parse_byte();
            parse_bytes(3);
            size = parse_short();
            parse_bytes(36);
            if (echo) printf("\n");
            char_styles[i].font = get_font(font_id);
            char_styles[i].mode =
mode;
            char_styles[i].size = size;
        }
    }
    if (echo)
    {
        if (dpos != end_dpos)
            printf("dpos = %ld, end_dpos = %ld\n", dpos, end_dpos);
        printf("\n");
    }
}

/* Read the paragraph style table into the global par_styles.  The section
   is only parsed when its size is a multiple of the 256-byte record size;
   from each record only the style definition id is used. */
void parse_par_styles(lword end_dpos)
{
    if (echo) printf("\nParagraph styles\n\n");
    if ((end_dpos - dpos) % 256 == 0)
    {
        word i;
        nr_par_styles = (end_dpos - dpos) / 256;
        par_styles = NALLOC(nr_par_styles, par_style_t);
        for (i = 0; i < nr_par_styles; i++)
        {
            word par_style_def_id;
            if (echo) printf("%3d : ", i);
            parse_bytes(250);
            par_style_def_id = parse_short();
            parse_long();
            if (echo) printf("\n");
            par_styles[i].par_style_def = get_par_style_def(par_style_def_id);
        }
    }
    if (echo)
    {
        if (dpos != end_dpos)
            printf("dpos = %ld, end_dpos = %ld\n", dpos, end_dpos);
        printf("\n");
    }
}

/* Parse an "extra" record: a zero word, a count, a byte size that must be
   four times the count, then that many 32-bit values. */
void parse_extra()
{
    lword esize;
    lword elen;
    lword j;
    if (echo) printf("Extra: ");
    expect_short(0, "s5");
    if (echo) printf("len=");
    elen = parse_long();
    if (echo) printf("nr=");
    esize = parse_long();
    if (elen * 4 != esize)
    {
        printf("\nlen and size do not match\n");
        dump_context(dpos);
    }
    for (j = 0; j < elen; j++)
        parse_long();
}

/* Wildcard for the second value of parse_tail(). */
#define ANY_V 0xFFFFFF

/* Test whether a 12-byte tail record (a, 0, b, 0) starts at dpos, where b
   may be ANY_V to accept any value.  Consumes the record and returns TRUE
   when it matches; otherwise leaves dpos unchanged and returns FALSE. */
bool parse_tail(lword a, lword b)
{
    if (echo) printf("%lX extra ", a);
    if (   dlword(dpos) == a
        && data[dpos+4] == 0
        && data[dpos+5] == 0
        && (b == ANY_V || dlword(dpos+6) == b)
        && data[dpos+10] == 0
        && data[dpos+11] == 0)
    {
        if (echo) printf("yes !! ");
        expect_long(a, "y1");
        expect_short(0, "y2");
        if (b == ANY_V)
            parse_long();
        else
            expect_long(b, "y3");
        expect_short(0, "y4");
        return TRUE;
    }
    else
    {
        if (echo) printf("not !! ");
        return FALSE;
    }
}

/* c numbers the frames in the order they are parsed; swap_mode carries
   state from one parsed frame to the next. */
int c;
int swap_mode;

/* Parse one frame record at dpos into `frame`; for frames of type 3 the
   frame's sub_text may be attached to the text with the parsed text id.
   Unexpected values stop the program via dump_context(). */
void parse_frame(frame_t *frame, text_t *texts)
{
    int no_text;
    int t2;
    int t0;
    word v;
    lword text_id;
    static lword prev_text_id = 0;

    if (echo) printf(" B(%d)=", c);
    frame->nr_in_file = c;
    frame->id = parse_long(); /* frame id??
*/
    /* The low byte of the next word is compared with the frame type below. */
    no_text = data[dpos];
    parse_short();
    parse_long();
    c++;
    parse_short();
    parse_short();
    v = parse_short();
    if (v != 0 && v != 0xFFA6)
    {
        printf("\ns1.0 must be 0 or FFA6\n");
        dump_context(dpos);
    }
    expect_short(0, "s1.1");
    expect_short(0, "s1.2");
    if (echo) printf("text id=");
    text_id = parse_long();
    expect_long(0, "s2");
    if (echo) printf("t0=");
    t0 = parse_short();
    parse_short();
    parse_short();
    frame->type = parse_byte();
    if (no_text != (frame->type == 2))
    {
        printf("\nno_text and frame_type do not match\n");
        dump_context(dpos);
    }
    t2 = parse_byte();
    expect_long(0, "s3");
    parse_box(&frame->box);
    if (frame->type == 2)
    {
        /* empty frame */
        /*lword w =*/ parse_long(); /* 0xd9b3, 0xd99a */
        expect_short(0, "short2");
    }
    else
    {
        lword offset_in_text;
        word t3, t4;
        expect_long(0, "s4.1");
        expect_long(0x10000, "s4.2");
        if (data[dpos] == 0 && data[dpos+1] == 0)
        {
            parse_extra();
        }
        else
        {
            int new_swap_mode;
            lword v, has_text, segnr;
            bool has_tail = FALSE;
            expect_long(0x8001, "s7");
            has_text = parse_long();
            parse_short();
            if (echo) printf("Offset=");
            offset_in_text = parse_long();
            t3 = parse_short();
            t4 = parse_short();
            /* Accepted (t3, t4) combinations.  NOTE(review): only the first
               two branches assign new_swap_mode; the other accepted
               combinations leave it unset. */
            if ((t3 == 3 || t3 == 4) && t4 == 0)
                new_swap_mode = 1;
            else if (t3 == 0 && t4 == 0x20)
                new_swap_mode = 2;
            else if (t3 == 4 && t4 == 2)
                ;
            else if (t3 == 4 && t4 == 3)
                ;
            else if (t3 == 4 && t4 == 4)
                ;
            else if (t3 == 0 && t4 == 0x24)
                ;
            else if (t3 == 3 && t4 == 4)
                ;
            else
            {
                printf("\nst3,t4 should be 3 or 4 or 200000h\n");
                dump_context(dpos);
            }
            parse_long(); /*0xbffc2 "s8.1" */
            expect_long(0, "s8.2");
            expect_long(0, "s8.3");
            expect_long(0, "s8.4");
            expect_long(0, "s8.5");
            expect_long(0, "s8.6");
            expect_long(0, "s8.7");
            v = parse_long(); /* nr columns in frame */
            if (v != 1 && v != 2)
            {
                printf("\ns8.8 should be 1 or 2h\n");
                dump_context(dpos);
            }
            expect_long(0, "s8.9");
            expect_short(0, "s9.1");
            if (echo) printf("segnr=");
            segnr = parse_long();
            if ((offset_in_text > 0) && (t3 != 4))
                if (echo) printf("Offset <> t3 == 4 ");
            if (offset_in_text > 0)
            {
                lword v;
                parse_long();
                v
= parse_long();
                if (v != 0 && v != 0x160 && v != 0x30000 && v != 0x10000)
                {
                    printf("\ns9.2 should be 0 or 0x160 or 0x30000 or 0x10000 not %lx\n", v);
                    dump_context(dpos);
                }
            }
            else
            {
                lword v1;
                word s1;
                expect_short(0,"s10.3");
                expect_long(0,"s10.4");
                parse_short(); /* 0 or 1 */
                expect_long(0,"s10.5");
                v1 = parse_long();
                s1 = parse_short();
                if (v1 != 0)
                {
                    /* A length-prefixed run of bytes follows. */
                    lword v2len;
                    v1 = parse_short();
                    if (v1 != 0 && v1 != 4)
                    {
                        printf("\ns10.7b expect 0 or 4\n");
                        dump_context(dpos);
                    }
                    v2len = parse_long();
                    parse_bytes(v2len);
                }
                else
                {
                    word v;
                    v = parse_short();
                    if (v == 1)
                        has_tail = has_tail || parse_tail(0x6D, 0xC);
                }
            }
            /* Try the tail records in turn; because of the ||, parse_tail()
               is no longer called once one tail has been found. */
            if ((t0 == 7 || t0 == 10) && swap_mode && swap_mode != new_swap_mode)
                has_tail = has_tail || parse_tail(0x70, ANY_V);
            if (t0 == 0 && has_text == 1)
                has_tail = has_tail || parse_tail(0x29, 3);
            if (t0 == 0xE)
                has_tail = has_tail || parse_tail(0x26, 1);
            if (t0 == 0xA)
                has_tail = has_tail || parse_tail(0x26, 3);
            if (t3 == 4 && t4 == 3)
                has_tail = has_tail || parse_tail(0x3F, 0);
            if (t3 == 4 && t4 == 2)
                has_tail = has_tail || parse_tail(0x23, 2);
            /* NOTE(review): new_swap_mode is read here and in the first
               test above; confirm it has been assigned on every path that
               reaches this point. */
            swap_mode = new_swap_mode;
            /* A type 3 frame without a tail is attached to its text. */
            if (frame->type == 3 && !has_tail)
            {
                text_t *text = get_text_with_id(texts, text_id);
                if (text != NULL)
                    init_sub_text(&frame->sub_text, text, offset_in_text);
            }
        }
    }
    if (frame->type == 3)
    {
        printf("\nxxx %ld %ld %s\n", text_id, text_id - prev_text_id, frame->sub_text.text ? "" : "empty");
        prev_text_id = text_id;
    }
    if (echo) printf(" end frame\n");
    /*print_sub_text_as_XML(&frame->sub_text, stdout, 0, TRUE);*/
}

/* Parse one page record at dpos: a table of 60-byte t1 entries, one table
   of 14-byte t2 entries per t1 entry, six fixed 32-bit values, and then
   the frames of the page, which are stored in `page` and organised with
   organize_page().  Inconsistent sizes stop the program via
   dump_context(). */
void parse_page(page_t *page, text_t *texts)
{
    lword esize;
    lword elen;
    lword nrt1;
    lword i,j;
    lword nr_frames;
    box_t dummy_box;

    if (echo) printf(" Page: ");
    parse_short();
    esize = parse_long();
    elen = parse_short();
    parse_short();
    if (elen * 60 + 4 != esize)
    {
        printf("\nsize and len of t1 do not match\n");
        dump_context(dpos);
    }
    nrt1 = elen;
    for(j = 0; j < elen; j++)
    {
        /* The values of a t1 entry are read and discarded. */
        if (echo) printf("t1(");
        parse_short();
        parse_short();
        parse_box(&dummy_box);
        parse_long();
        parse_short();
        parse_long();
        parse_short();
        parse_short();
        parse_box(&dummy_box);
        parse_short();
        parse_short();
        parse_short();
        parse_byte();
        parse_byte();
        parse_short();
        if (echo) printf(")");
    }
    for (i = 0; i < nrt1; i++)
    {
        esize = parse_long();
        elen = parse_long();
        if (elen * 14 + 4 != esize)
        {
            printf("\nsize and len of t2 do not match\n");
            dump_context(dpos);
        }
        for(j = 0; j < elen; j++)
        {
            /* The values of a t2 entry are read and discarded. */
            if (echo) printf("t2(");
            parse_long();
            parse_long();
            parse_long();
            parse_short();
            if (echo) printf(")");
        }
        expect_long(0, "s6");
    }
    expect_long(4, "t5.1");
    expect_long(0, "t5.2");
    expect_long(0, "t5.3");
    expect_long(0, "t5.4");
    expect_long(4, "t5.5");
    expect_long(0, "t5.6");
    nr_frames = parse_long();
    if (echo) printf("\nNr frames = %ld\n", nr_frames);
    page_set_nr_frames(page, nr_frames);
    /* swap_mode starts at 0 for every page. */
    swap_mode = 0;
    for (i = 0; i < nr_frames; i++)
    {
        frame_t *frame = new_frame(page, i);
        parse_frame(frame, texts);
    }
    if (echo) printf("\nEnd of frames\n");
    organize_page(page);
    /*
    {
        int i;
        for (i = 0; i < page->nr_frames; i++)
        {
            printf("%4d %12ld %12ld %12ld %12ld %8ld %8ld",
                   page->frames[i]->nr_in_file,
                   page->frames[i]->box.x1,
                   page->frames[i]->box.y1,
                   page->frames[i]->box.x2,
                   page->frames[i]->box.y2,
                   page->frames[i]->box.x2 - page->frames[i]->box.x1,
                   page->frames[i]->box.y2 - page->frames[i]->box.y1);
            print_sub_text_as_XML(&page->frames[i]->sub_text, stdout, 0, FALSE);
        }
    }
    */
}

void print_page(page_t
*page, FILE *f) { word i; bool left_side_page = FALSE; fprintf(f, "\n"); for (i = 0; i < page->nr_frames; i++) { frame_t *frame = page->frames[i]; if (frame->box.x2 < 70000000) left_side_page = TRUE; else if (left_side_page) { fprintf(f, "\n\n\n"); left_side_page = FALSE; } if (frame->parent == NULL) print_frame(f, frame, 2); } fprintf(f, "\n\n"); } void print_pages(page_t *pages, FILE *f) { page_t *page; fprintf(f, "\n"); FORALL_IN_LIST(page, pages) print_page(page, f); fprintf(f, "\n"); } void analyse_data(text_t *texts, FILE *fout) { page_t *pages = NULL; c = 0; dpos = 0L; echo = FALSE; echo_all = FALSE; translate = FALSE; if (echo) printf("length of data = %ld\n", dlen); { long i, j; #ifdef DUMP_FIRST_PART for (i = 0; i < 861; i++) { if (i % 16 == 0) printf( "%3d = ", i); printf("%02X ", data[i]); if (i % 16 == 15) printf("\n"); } #endif dpos = 861; i = dlword(dpos); dpos += i + 8; for (j = 0; j < 10 && dpos < dlen+4; j++) { lword len = parse_long(); lword end_dpos = dpos + len; if (echo) printf("\nat %ld len = %08lx (=%ld) \n", dpos, len, len); switch(j) { case 0: parse_font_families(end_dpos); break; case 1: parse_fonts(end_dpos); break; case 2: parse_colours(end_dpos); break; /* 3 */ case 4: parse_par_style_defs(end_dpos); break; /* 5 */ /* 6 */ case 7: parse_char_styles(end_dpos); break; case 8: parse_par_styles(end_dpos); break; /* 9 */ default: if (echo) for (i = 0; dpos < end_dpos; dpos++, i++) { if (i % 16 == 0) printf( "%3ld = ", i); printf(data[dpos] > ' ' && data[dpos] < 127 ? 
" %c " : "%02X ", data[dpos]); if (i % 16 == 15) printf("\n"); } break; } dpos = end_dpos; } } if (echo) { print_fonts(); print_par_style_def(); } echo = TRUE; translate = TRUE; printf("\n"); while (data[dpos+2] != 0) { page_t *page = new_page(&pages); parse_page(page, texts); } print_pages(pages, fout); { text_t *text; FORALL_IN_LIST(text, texts) { sub_text_t *sub_text; printf("text %7ld:", text->id); if (text->next) printf(" (diff %ld)", text->next->id - text->id); FORALL_IN_LIST(sub_text, text->sub_texts) printf(" %ld (%ld)", sub_text->frame->nr_in_file, sub_text->start); printf("\n"); } #ifdef RUBB for (i = 0; i < 800; i++) textids[i] = -1; FORALL_IN_LIST(page, pages) { for (j = 0; j < page->nr_frames; j++) textids[page->frames[j].nr_in_file] = page->frames[j].sub_text #endif } dump_context(dpos); } /****************************************/ int main(int argc, char *argv[]) { lword endtext; lword saveblockend; text_t *texts = NULL; data = (byte*)malloc(1000000); dlen = 0; if (argc != 2) { printf("%d, %s \n", argc, argv[0]); return 0; } CBuf input_buf; if (!input_buf.read_file(argv[1])) { printf("Cannot open %s\n", argv[1]); return 0; } init_input(); input.buf = input_buf.buf; input.length = input_buf.length; printf("Length = %ld bytes %ld.%ld\n", input.length, input.length / 256, input.length % 256); /* Try to find start of text */ { long nextblock; lword dfpos; { long i; for (i = 0; i < 3*256 - 4; i++) { data[dlen++] = input.buf[i]; DUMP_BLOCKS((/*input.buf[i] >= ' ' && input.buf[i] < 127 ? " %c " :*/ "%c%02x", input.buf[i] >= ' ' && input.buf[i] < 127 ? 
input.buf[i] : ' ', input.buf[i])); if ((i + 1) % 32 == 0) DUMP_BLOCKS(("\n")); } DUMP_BLOCKS(("\n")); } input.pos = 3*256 - 4; nextblock = input.read_lword(); for(;;) { lword curpos = input.pos; DUMP_SKIP(("nextblock = %lx\n", nextblock)); if (nextblock < 0) { word nrblocks; input.pos = (-nextblock) * 256 - 256; nrblocks = input.read_word(); dfpos = input.pos; input.pos -= 2; input.blockend = input.pos + 256 * nrblocks - 4; } else { input.pos = nextblock * 256 - 256; dfpos = input.pos; input.blockend = input.pos + 256 - 4; } DUMP_SKIP((" curpos = %lx, input.pos = %lx, blockend = %lx\n", curpos, input.pos, input.blockend)); { long i; for (i = dfpos; i < input.blockend; i++) { data[dlen++] = input.buf[i]; DUMP_BLOCKS((input.buf[i] >= ' ' && input.buf[i] < 127 ? " %c " : "%02x ", input.buf[i])); if ((i + 1) % 32 == 0) DUMP_BLOCKS(("\n")); } DUMP_BLOCKS(("\n")); } if (input.pos != curpos) { endtext = input.pos; saveblockend = input.blockend; input.pos = curpos; break; } input.pos = input.blockend; nextblock = input.read_lword(); } } #ifdef READ_TEXT texts = read_texts(input.pos, endtext); /*print_texts_as_XML(texts, out_file, FALSE);*/ #endif /* Dump the rest */ { long nextblock; lword dfpos; input.pos = saveblockend; nextblock = input.read_lword(); for(;input.pos < input.length;) { lword curpos = input.pos; DUMP_SKIP(("nextblock = %lx\n", nextblock)); if (nextblock < 0) { word nrblocks; input.pos = (-nextblock) * 256 - 256; nrblocks = input.read_word(); dfpos = input.pos; input.pos -= 2; input.blockend = input.pos + 256 * nrblocks - 4; } else { input.pos = nextblock * 256 - 256; dfpos = input.pos; input.blockend = input.pos + 256 - 4; } DUMP_SKIP((" curpos = %lx, input.pos = %lx, blockend = %lx\n", curpos, input.pos, input.blockend)); { long i; for (i = dfpos; i < input.blockend; i++) { data[dlen++] = input.buf[i]; DUMP_BLOCKS(("%02x ", input.buf[i])); if ((i + 1) % 32 == 0) DUMP_BLOCKS(("\n")); } DUMP_BLOCKS(("\n")); } if (input.pos != curpos) { 
DUMP_SKIP(("***JUMP***\n")); } input.pos = input.blockend; nextblock = input.read_lword(); } } { char outfn[300]; char *s; FILE *fout = NULL; strcpy(outfn, argv[1]); for (s = outfn + strlen(outfn); s > outfn && *s != '.'; s--); if (*s == '.') strcpy(s, "_s1.xml"); else exit(1); fout = fopen(outfn, "w"); if (fout == NULL) fout = stdout; analyse_data(texts, fout); } fflush(stdout); return 0; }