filters

TextOutputDev.cc

00001 //========================================================================
00002 //
00003 // TextOutputDev.cc
00004 //
00005 // Copyright 1997-2002 Glyph & Cog, LLC
00006 //
00007 //========================================================================
00008 
00009 #include <aconf.h>
00010 
00011 #ifdef USE_GCC_PRAGMAS
00012 #pragma implementation
00013 #endif
00014 
00015 #include <stdio.h>
00016 #include <stdlib.h>
00017 #include <stddef.h>
00018 #include <math.h>
00019 #include <ctype.h>
00020 #include "GString.h"
00021 #include "gmem.h"
00022 #include "config.h"
00023 #include "Error.h"
00024 #include "GlobalParams.h"
00025 #include "UnicodeMap.h"
00026 #include "GfxState.h"
00027 #include "TextOutputDev.h"
00028 
00029 #ifdef MACOS
00030 // needed for setting type/creator of MacOS files
00031 #include "ICSupport.h"
00032 #endif
00033 
00034 //------------------------------------------------------------------------
00035 
// Gap thresholds, both expressed as a fraction of a string's height
// (yMax - yMin):
//   textOutSpace    - when the horizontal gap between two consecutive
//                     strings of a block exceeds this, a space
//                     character is inserted between them
//   textOutColSpace - gap threshold used while searching for column
//                     edges
#define textOutSpace    0.2
#define textOutColSpace 0.2
00038 
00039 //------------------------------------------------------------------------
00040 
// A column edge: a vertical boundary at horizontal position <x>,
// covering the vertical range <y0>..<y1>.
struct TextOutColumnEdge {
  double x;
  double y0;
  double y1;
};
00044 
00045 //------------------------------------------------------------------------
00046 // TextBlock
00047 //------------------------------------------------------------------------
00048 
00049 TextBlock::TextBlock() {
00050   strings = NULL;
00051   next = NULL;
00052   xyNext = NULL;
00053   text = NULL;
00054   xRight = NULL;
00055   col = NULL;
00056 }
00057 
00058 TextBlock::~TextBlock() {
00059   TextString *p1, *p2;
00060 
00061   for (p1 = strings; p1; p1 = p2) {
00062     p2 = p1->next;
00063     delete p1;
00064   }
00065   gfree(text);
00066   gfree(xRight);
00067   gfree(col);
00068 }
00069 
00070 //------------------------------------------------------------------------
00071 // TextLine
00072 //------------------------------------------------------------------------
00073 
00074 TextLine::TextLine() {
00075   blocks = NULL;
00076   next = NULL;
00077 }
00078 
00079 TextLine::~TextLine() {
00080   TextBlock *p1, *p2;
00081 
00082   for (p1 = blocks; p1; p1 = p2) {
00083     p2 = p1->next;
00084     delete p1;
00085   }
00086 }
00087 
00088 //------------------------------------------------------------------------
00089 // TextString
00090 //------------------------------------------------------------------------
00091 
00092 TextString::TextString(GfxState *state, double x0, double y0,
00093                double fontSize) {
00094   GfxFont *font;
00095   double x, y;
00096 
00097   state->transform(x0, y0, &x, &y);
00098   if ((font = state->getFont())) {
00099     yMin = y - font->getAscent() * fontSize;
00100     yMax = y - font->getDescent() * fontSize;
00101   } else {
00102     // this means that the PDF file draws text without a current font,
00103     // which should never happen
00104     yMin = y - 0.95 * fontSize;
00105     yMax = y + 0.35 * fontSize;
00106   }
00107   if (yMin == yMax) {
00108     // this is a sanity check for a case that shouldn't happen -- but
00109     // if it does happen, we want to avoid dividing by zero later
00110     yMin = y;
00111     yMax = y + 1;
00112   }
00113   marked = gFalse;
00114   text = NULL;
00115   xRight = NULL;
00116   len = size = 0;
00117   next = NULL;
00118 }
00119 
00120 
00121 TextString::~TextString() {
00122   gfree(text);
00123   gfree(xRight);
00124 }
00125 
00126 void TextString::addChar(GfxState */*state*/, double x, double /*y*/,
00127                          double dx, double /*dy*/, Unicode u) {
00128   if (len == size) {
00129     size += 16;
00130     text = (Unicode *)grealloc(text, size * sizeof(Unicode));
00131     xRight = (double *)grealloc(xRight, size * sizeof(double));
00132   }
00133   text[len] = u;
00134   if (len == 0) {
00135     xMin = x;
00136   }
00137   xMax = xRight[len] = x + dx;
00138   ++len;
00139 }
00140 
00141 //------------------------------------------------------------------------
00142 // TextPage
00143 //------------------------------------------------------------------------
00144 
00145 TextPage::TextPage(GBool rawOrderA) {
00146   rawOrder = rawOrderA;
00147   curStr = NULL;
00148   fontSize = 0;
00149   xyStrings = NULL;
00150   xyCur1 = xyCur2 = NULL;
00151   lines = NULL;
00152   nest = 0;
00153   nTinyChars = 0;
00154 }
00155 
00156 TextPage::~TextPage() {
00157   clear();
00158 }
00159 
00160 void TextPage::updateFont(GfxState *state) {
00161   GfxFont *font;
00162   double *fm;
00163   char *name;
00164   int code, mCode, letterCode, anyCode;
00165   double w;
00166 
00167   // adjust the font size
00168   fontSize = state->getTransformedFontSize();
00169   if ((font = state->getFont()) && font->getType() == fontType3) {
00170     // This is a hack which makes it possible to deal with some Type 3
00171     // fonts.  The problem is that it's impossible to know what the
00172     // base coordinate system used in the font is without actually
00173     // rendering the font.  This code tries to guess by looking at the
00174     // width of the character 'm' (which breaks if the font is a
00175     // subset that doesn't contain 'm').
00176     mCode = letterCode = anyCode = -1;
00177     for (code = 0; code < 256; ++code) {
00178       name = ((Gfx8BitFont *)font)->getCharName(code);
00179       if (name && name[0] == 'm' && name[1] == '\0') {
00180     mCode = code;
00181       }
00182       if (letterCode < 0 && name && name[1] == '\0' &&
00183       ((name[0] >= 'A' && name[0] <= 'Z') ||
00184        (name[0] >= 'a' && name[0] <= 'z'))) {
00185     letterCode = code;
00186       }
00187       if (anyCode < 0 && name && ((Gfx8BitFont *)font)->getWidth(code) > 0) {
00188     anyCode = code;
00189       }
00190     }
00191     if (mCode >= 0 &&
00192     (w = ((Gfx8BitFont *)font)->getWidth(mCode)) > 0) {
00193       // 0.6 is a generic average 'm' width -- yes, this is a hack
00194       fontSize *= w / 0.6;
00195     } else if (letterCode >= 0 &&
00196            (w = ((Gfx8BitFont *)font)->getWidth(letterCode)) > 0) {
00197       // even more of a hack: 0.5 is a generic letter width
00198       fontSize *= w / 0.5;
00199     } else if (anyCode >= 0 &&
00200            (w = ((Gfx8BitFont *)font)->getWidth(anyCode)) > 0) {
00201       // better than nothing: 0.5 is a generic character width
00202       fontSize *= w / 0.5;
00203     }
00204     fm = font->getFontMatrix();
00205     if (fm[0] != 0) {
00206       fontSize *= fabs(fm[3] / fm[0]);
00207     }
00208   }
00209 }
00210 
00211 void TextPage::beginString(GfxState *state, double x0, double y0) {
00212   // This check is needed because Type 3 characters can contain
00213   // text-drawing operations.
00214   if (curStr) {
00215     ++nest;
00216     return;
00217   }
00218 
00219   curStr = new TextString(state, x0, y0, fontSize);
00220 }
00221 
00222 void TextPage::addChar(GfxState *state, double x, double y,
00223                double dx, double dy, Unicode *u, int uLen) {
00224   double x1, y1, w1, h1, dx2, dy2;
00225   int n, i;
00226 
00227   state->transform(x, y, &x1, &y1);
00228   if (x1 < 0 || x1 > state->getPageWidth() ||
00229       y1 < 0 || y1 > state->getPageHeight()) {
00230     return;
00231   }
00232   state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(),
00233                 0, &dx2, &dy2);
00234   dx -= dx2;
00235   dy -= dy2;
00236   state->transformDelta(dx, dy, &w1, &h1);
00237   if (!globalParams->getTextKeepTinyChars() &&
00238       fabs(w1) < 3 && fabs(h1) < 3) {
00239     if (++nTinyChars > 20000) {
00240       return;
00241     }
00242   }
00243   n = curStr->len;
00244   if (n > 0 && x1 - curStr->xRight[n-1] >
00245                0.1 * (curStr->yMax - curStr->yMin)) {
00246     // large char spacing is sometimes used to move text around
00247     endString();
00248     beginString(state, x, y);
00249   }
00250   if (uLen == 1 && u[0] == (Unicode)0x20 &&
00251       w1 > 0.5 * (curStr->yMax - curStr->yMin)) {
00252     // large word spacing is sometimes used to move text around
00253     return;
00254   }
00255   if (uLen != 0) {
00256     w1 /= uLen;
00257     h1 /= uLen;
00258   }
00259   for (i = 0; i < uLen; ++i) {
00260     curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
00261   }
00262 }
00263 
00264 void TextPage::endString() {
00265   // This check is needed because Type 3 characters can contain
00266   // text-drawing operations.
00267   if (nest > 0) {
00268     --nest;
00269     return;
00270   }
00271 
00272   addString(curStr);
00273   curStr = NULL;
00274 }
00275 
00276 void TextPage::addString(TextString *str) {
00277   TextString *p1, *p2;
00278 
00279   // throw away zero-length strings -- they don't have valid xMin/xMax
00280   // values, and they're useless anyway
00281   if (str->len == 0) {
00282     delete str;
00283     return;
00284   }
00285 
00286   // insert string in xy list
00287   if (rawOrder) {
00288     p1 = xyCur1;
00289     p2 = NULL;
00290   } else if ((!xyCur1 || xyBefore(xyCur1, str)) &&
00291          (!xyCur2 || xyBefore(str, xyCur2))) {
00292     p1 = xyCur1;
00293     p2 = xyCur2;
00294   } else if (xyCur1 && xyBefore(xyCur1, str)) {
00295     for (p1 = xyCur1, p2 = xyCur2; p2; p1 = p2, p2 = p2->next) {
00296       if (xyBefore(str, p2)) {
00297     break;
00298       }
00299     }
00300     xyCur2 = p2;
00301   } else {
00302     for (p1 = NULL, p2 = xyStrings; p2; p1 = p2, p2 = p2->next) {
00303       if (xyBefore(str, p2)) {
00304     break;
00305       }
00306     }
00307     xyCur2 = p2;
00308   }
00309   xyCur1 = str;
00310   if (p1) {
00311     p1->next = str;
00312   } else {
00313     xyStrings = str;
00314   }
00315   str->next = p2;
00316 }
00317 
00318 void TextPage::coalesce() {
00319   TextLine *line, *line0;
00320   TextBlock *yxBlocks, *xyBlocks, *blk, *blk0, *blk1, *blk2;
00321   TextString *str0, *str1, *str2, *str3, *str4;
00322   TextString *str1prev, *str2prev, *str3prev;
00323   TextOutColumnEdge *edges;
00324   UnicodeMap *uMap;
00325   GBool isUnicode;
00326   char buf[8];
00327   int edgesLength, edgesSize;
00328   double x, yMin, yMax;
00329   double space, fit1, fit2, h;
00330   int col1, col2, d;
00331   int i, j;
00332 
00333 #if 0 //~ for debugging
00334   for (str1 = xyStrings; str1; str1 = str1->next) {
00335     printf("x=%.2f..%.2f  y=%.2f..%.2f  size=%.2f '",
00336        str1->xMin, str1->xMax, str1->yMin, str1->yMax,
00337        (str1->yMax - str1->yMin));
00338     for (i = 0; i < str1->len; ++i) {
00339       fputc(str1->text[i] & 0xff, stdout);
00340     }
00341     printf("'\n");
00342   }
00343   printf("\n------------------------------------------------------------\n\n");
00344 #endif
00345 
00346   // build the list of column edges
00347   edges = NULL;
00348   edgesLength = edgesSize = 0;
00349   if (!rawOrder) {
00350     for (str1prev = NULL, str1 = xyStrings;
00351      str1;
00352      str1prev = str1, str1 = str1->next) {
00353       if (str1->marked) {
00354     continue;
00355       }
00356       h = str1->yMax - str1->yMin;
00357       if (str1prev && (str1->xMin - str1prev->xMax) / h < textOutColSpace) {
00358     continue;
00359       }
00360       x = str1->xMin;
00361       yMin = str1->yMin;
00362       yMax = str1->yMax;
00363       for (str2prev = str1, str2 = str1->next;
00364        str2;
00365        str2prev = str2, str2 = str2->next) {
00366     h = str2->yMax - str2->yMin;
00367     if (!str2->marked &&
00368         (str2->xMin - str2prev->xMax) / h > textOutColSpace &&
00369         fabs(str2->xMin - x) < 0.5 &&
00370         str2->yMin - yMax < 0.3 * h &&
00371         yMin - str2->yMax < 0.3 * h) {
00372       break;
00373     }
00374       }
00375       if (str2) {
00376     if (str2->yMin < yMin) {
00377       yMin = str2->yMin;
00378     }
00379     if (str2->yMax > yMax) {
00380       yMax = str2->yMax;
00381     }
00382     str2->marked = gTrue;
00383     for (str3prev = str1, str3 = str1->next;
00384          str3;
00385          str3prev = str3, str3 = str3->next) {
00386       h = str3->yMax - str3->yMin;
00387       if (!str3->marked &&
00388           (str3->xMin - str3prev->xMax) / h > textOutColSpace &&
00389           fabs(str3->xMin - x) < 0.5 &&
00390           str3->yMin - yMax < 0.3 * h &&
00391           yMin - str3->yMax < 0.3 * h) {
00392         break;
00393       }
00394     }
00395     if (str3) {
00396       if (str3->yMin < yMin) {
00397         yMin = str3->yMin;
00398       }
00399       if (str3->yMax > yMax) {
00400         yMax = str3->yMax;
00401       }
00402       str3->marked = gTrue;
00403       do {
00404         for (str2prev = str1, str2 = str1->next;
00405          str2;
00406          str2prev = str2, str2 = str2->next) {
00407           h = str2->yMax - str2->yMin;
00408           if (!str2->marked &&
00409           (str2->xMin - str2prev->xMax) / h > textOutColSpace &&
00410           fabs(str2->xMin - x) < 0.5 &&
00411           str2->yMin - yMax < 0.3 * h &&
00412           yMin - str2->yMax < 0.3 * h) {
00413         if (str2->yMin < yMin) {
00414           yMin = str2->yMin;
00415         }
00416         if (str2->yMax > yMax) {
00417           yMax = str2->yMax;
00418         }
00419         str2->marked = gTrue;
00420         break;
00421           }
00422         }
00423       } while (str2);
00424       if (edgesLength == edgesSize) {
00425         edgesSize = edgesSize ? 2 * edgesSize : 16;
00426         edges = (TextOutColumnEdge *)
00427           grealloc(edges, edgesSize * sizeof(TextOutColumnEdge));
00428       }
00429       edges[edgesLength].x = x;
00430       edges[edgesLength].y0 = yMin;
00431       edges[edgesLength].y1 = yMax;
00432       ++edgesLength;
00433     } else {
00434       str2->marked = gFalse;
00435     }
00436       }
00437       str1->marked = gTrue;
00438     }
00439   }
00440 
00441 #if 0 //~ for debugging
00442   printf("column edges:\n");
00443   for (i = 0; i < edgesLength; ++i) {
00444     printf("%d: x=%.2f y0=%.2f y1=%.2f\n",
00445        i, edges[i].x, edges[i].y0, edges[i].y1);
00446   }
00447   printf("\n------------------------------------------------------------\n\n");
00448 #endif
00449 
00450   // build the blocks
00451   yxBlocks = NULL;
00452   blk1 = blk2 = NULL;
00453   while (xyStrings) {
00454 
00455     // build the block
00456     str0 = xyStrings;
00457     xyStrings = xyStrings->next;
00458     str0->next = NULL;
00459     blk = new TextBlock();
00460     blk->strings = str0;
00461     blk->xMin = str0->xMin;
00462     blk->xMax = str0->xMax;
00463     blk->yMin = str0->yMin;
00464     blk->yMax = str0->yMax;
00465     while (xyStrings) {
00466       str1 = NULL;
00467       str2 = xyStrings;
00468       fit1 = coalesceFit(str0, str2);
00469       if (!rawOrder) {
00470     // look for best-fitting string
00471     space = str0->yMax - str0->yMin;
00472     for (str3 = xyStrings, str4 = xyStrings->next;
00473          str4 && str4->xMin - str0->xMax <= space;
00474          str3 = str4, str4 = str4->next) {
00475       fit2 = coalesceFit(str0, str4);
00476       if (fit2 < fit1) {
00477         str1 = str3;
00478         str2 = str4;
00479         fit1 = fit2;
00480       }
00481     }
00482       }
00483       if (fit1 > 1) {
00484     // no fit - we're done with this block
00485     break;
00486       }
00487 
00488       // if we've hit a column edge we're done with this block
00489       if (fit1 > 0.2) {
00490     for (i = 0; i < edgesLength; ++i) {
00491       if (str0->xMax < edges[i].x + 0.5 && edges[i].x - 0.5 < str2->xMin &&
00492           str0->yMin < edges[i].y1 && str0->yMax > edges[i].y0 &&
00493           str2->yMin < edges[i].y1 && str2->yMax > edges[i].y0) {
00494         break;
00495       }
00496     }
00497     if (i < edgesLength) {
00498       break;
00499     }
00500       }
00501 
00502       if (str1) {
00503     str1->next = str2->next;
00504       } else {
00505     xyStrings = str2->next;
00506       }
00507       str0->next = str2;
00508       str2->next = NULL;
00509       if (str2->xMax > blk->xMax) {
00510     blk->xMax = str2->xMax;
00511       }
00512       if (str2->yMin < blk->yMin) {
00513     blk->yMin = str2->yMin;
00514       }
00515       if (str2->yMax > blk->yMax) {
00516     blk->yMax = str2->yMax;
00517       }
00518       str0 = str2;
00519     }
00520 
00521     // insert block on list
00522     if (!rawOrder) {
00523       // insert block on list in yx order
00524       for (blk1 = NULL, blk2 = yxBlocks;
00525        blk2 && !yxBefore(blk, blk2);
00526        blk1 = blk2, blk2 = blk2->next) ;
00527     }
00528     blk->next = blk2;
00529     if (blk1) {
00530       blk1->next = blk;
00531     } else {
00532       yxBlocks = blk;
00533     }
00534     blk1 = blk;
00535   }
00536 
00537   gfree(edges);
00538 
00539   // the strings are now owned by the lines/blocks tree
00540   xyStrings = NULL;
00541 
00542   // build the block text
00543   uMap = globalParams->getTextEncoding();
00544   isUnicode = uMap ? uMap->isUnicode() : gFalse;
00545   for (blk = yxBlocks; blk; blk = blk->next) {
00546     blk->len = 0;
00547     for (str1 = blk->strings; str1; str1 = str1->next) {
00548       blk->len += str1->len;
00549       if (str1->next && str1->next->xMin - str1->xMax >
00550                     textOutSpace * (str1->yMax - str1->yMin)) {
00551     str1->spaceAfter = gTrue;
00552     ++blk->len;
00553       } else {
00554     str1->spaceAfter = gFalse;
00555       }
00556     }
00557     blk->text = (Unicode *)gmalloc(blk->len * sizeof(Unicode));
00558     blk->xRight = (double *)gmalloc(blk->len * sizeof(double));
00559     blk->col = (int *)gmalloc(blk->len * sizeof(int));
00560     i = 0;
00561     for (str1 = blk->strings; str1; str1 = str1->next) {
00562       for (j = 0; j < str1->len; ++j) {
00563     blk->text[i] = str1->text[j];
00564     blk->xRight[i] = str1->xRight[j];
00565     ++i;
00566       }
00567       if (str1->spaceAfter) {
00568     blk->text[i] = (Unicode)0x0020;
00569     blk->xRight[i] = str1->next->xMin;
00570     ++i;
00571       }
00572     }
00573     blk->convertedLen = 0;
00574     for (j = 0; j < blk->len; ++j) {
00575       blk->col[j] = blk->convertedLen;
00576       if (isUnicode) {
00577     ++blk->convertedLen;
00578       } else if (uMap) {
00579     blk->convertedLen += uMap->mapUnicode(blk->text[j], buf, sizeof(buf));
00580       }
00581     }
00582   }
00583   if (uMap) {
00584     uMap->decRefCnt();
00585   }
00586 
00587 #if 0 //~ for debugging
00588   for (blk = yxBlocks; blk; blk = blk->next) {
00589     printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n",
00590        blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len);
00591     TextString *str;
00592     for (str = blk->strings; str; str = str->next) {
00593       printf("    x=%.2f..%.2f  y=%.2f..%.2f  size=%.2f'",
00594          str->xMin, str->xMax, str->yMin, str->yMax,
00595          (str->yMax - str->yMin));
00596       for (i = 0; i < str->len; ++i) {
00597     fputc(str->text[i] & 0xff, stdout);
00598       }
00599       if (str->spaceAfter) {
00600     fputc(' ', stdout);
00601       }
00602       printf("'\n");
00603     }
00604   }
00605   printf("\n------------------------------------------------------------\n\n");
00606 #endif
00607 
00608   // build the lines
00609   lines = NULL;
00610   line0 = NULL;
00611   while (yxBlocks) {
00612     blk0 = yxBlocks;
00613     yxBlocks = yxBlocks->next;
00614     blk0->next = NULL;
00615     line = new TextLine();
00616     line->blocks = blk0;
00617     line->yMin = blk0->yMin;
00618     line->yMax = blk0->yMax;
00619     while (yxBlocks) {
00620 
00621       // remove duplicated text (fake boldface, shadowed text)
00622       h = blk0->yMax - blk0->yMin;
00623       if (yxBlocks->len == blk0->len &&
00624       !memcmp(yxBlocks->text, blk0->text,
00625           yxBlocks->len * sizeof(Unicode)) &&
00626       fabs(yxBlocks->yMin - blk0->yMin) / h < 0.2 &&
00627       fabs(yxBlocks->yMax - blk0->yMax) / h < 0.2 &&
00628       fabs(yxBlocks->xMin - blk0->xMin) / h < 0.2 &&
00629       fabs(yxBlocks->xMax - blk0->xMax) / h < 0.2) {
00630     blk1 = yxBlocks;
00631     yxBlocks = yxBlocks->next;
00632     delete blk1;
00633     continue;
00634       }
00635 
00636       if (rawOrder && yxBlocks->yMax < blk0->yMin) {
00637     break;
00638       }
00639       if (yxBlocks->yMin > 0.2*blk0->yMin + 0.8*blk0->yMax ||
00640       yxBlocks->xMin < blk0->xMax) {
00641     break;
00642       }
00643       blk1 = yxBlocks;
00644       yxBlocks = yxBlocks->next;
00645       blk0->next = blk1;
00646       blk1->next = NULL;
00647       if (blk1->yMin < line->yMin) {
00648     line->yMin = blk1->yMin;
00649       }
00650       if (blk1->yMax > line->yMax) {
00651     line->yMax = blk1->yMax;
00652       }
00653       blk0 = blk1;
00654     }
00655     if (line0) {
00656       line0->next = line;
00657     } else {
00658       lines = line;
00659     }
00660     line->next = NULL;
00661     line0 = line;
00662   }
00663 
00664 
00665   // sort the blocks into xy order
00666   xyBlocks = NULL;
00667   for (line = lines; line; line = line->next) {
00668     for (blk = line->blocks; blk; blk = blk->next) {
00669       for (blk1 = NULL, blk2 = xyBlocks;
00670        blk2 && !xyBefore(blk, blk2);
00671        blk1 = blk2, blk2 = blk2->xyNext) ;
00672       blk->xyNext = blk2;
00673       if (blk1) {
00674     blk1->xyNext = blk;
00675       } else {
00676     xyBlocks = blk;
00677       }
00678     }
00679   }
00680 
00681 #if 0 //~ for debugging
00682   for (blk = xyBlocks; blk; blk = blk->xyNext) {
00683     printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n",
00684        blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len);
00685     TextString *str;
00686     for (str = blk->strings; str; str = str->next) {
00687       printf("    x=%.2f..%.2f  y=%.2f..%.2f  size=%.2f '",
00688          str->xMin, str->xMax, str->yMin, str->yMax,
00689          (str->yMax - str->yMin));
00690       for (i = 0; i < str->len; ++i) {
00691     fputc(str->text[i] & 0xff, stdout);
00692       }
00693       printf("'\n");
00694     }
00695   }
00696   printf("\n------------------------------------------------------------\n\n");
00697 #endif
00698 
00699   // do column assignment
00700   for (blk1 = xyBlocks; blk1; blk1 = blk1->xyNext) {
00701     col1 = 0;
00702     for (blk2 = xyBlocks; blk2 != blk1; blk2 = blk2->xyNext) {
00703       if (blk1->xMin >= blk2->xMax) {
00704     d = (int)((blk1->xMin - blk2->xMax) /
00705           (0.4 * (blk1->yMax - blk1->yMin)));
00706     if (d > 4) {
00707       d = 4;
00708     }
00709     col2 = blk2->col[0] + blk2->convertedLen + d;
00710     if (col2 > col1) {
00711       col1 = col2;
00712     }
00713       } else if (blk1->xMin > blk2->xMin) {
00714     for (i = 0; i < blk2->len && blk1->xMin >= blk2->xRight[i]; ++i) ;
00715     col2 = blk2->col[i];
00716     if (col2 > col1) {
00717       col1 = col2;
00718     }
00719       }
00720     }
00721     for (j = 0; j < blk1->len; ++j) {
00722       blk1->col[j] += col1;
00723     }
00724   }
00725 
00726 #if 0 //~ for debugging
00727   for (line = lines; line; line = line->next) {
00728     printf("[line]\n");
00729     for (blk = line->blocks; blk; blk = blk->next) {
00730       printf("[block: col=%d, len=%d]\n", blk->col[0], blk->len);
00731       TextString *str;
00732       for (str = blk->strings; str; str = str->next) {
00733     printf("    x=%.2f..%.2f  y=%.2f..%.2f  size=%.2f '",
00734            str->xMin, str->xMax, str->yMin, str->yMax,
00735            (str->yMax - str->yMin));
00736     for (i = 0; i < str->len; ++i) {
00737       fputc(str->text[i] & 0xff, stdout);
00738     }
00739     if (str->spaceAfter) {
00740       printf(" [space]\n");
00741     }
00742     printf("'\n");
00743       }
00744     }
00745   }
00746   printf("\n------------------------------------------------------------\n\n");
00747 #endif
00748 }
00749 
00750 
00751 GBool TextPage::findText(Unicode *s, int len,
00752              GBool top, GBool bottom,
00753              double *xMin, double *yMin,
00754              double *xMax, double *yMax) {
00755   TextLine *line;
00756   TextBlock *blk;
00757   Unicode *p;
00758   Unicode u1, u2;
00759   int m, i, j;
00760   double x0, x1, x;
00761 
00762   // scan all blocks on page
00763   for (line = lines; line; line = line->next) {
00764     for (blk = line->blocks; blk; blk = blk->next) {
00765 
00766       // check: above top limit?
00767       if (!top && (blk->yMax < *yMin ||
00768            (blk->yMin < *yMin && blk->xMax <= *xMin))) {
00769     continue;
00770       }
00771 
00772       // check: below bottom limit?
00773       if (!bottom && (blk->yMin > *yMax ||
00774               (blk->yMax > *yMax && blk->xMin >= *xMax))) {
00775     return gFalse;
00776       }
00777 
00778       // search each position in this block
00779       m = blk->len;
00780       for (i = 0, p = blk->text; i <= m - len; ++i, ++p) {
00781 
00782     x0 = (i == 0) ? blk->xMin : blk->xRight[i-1];
00783     x1 = blk->xRight[i];
00784     x = 0.5 * (x0 + x1);
00785 
00786     // check: above top limit?
00787     if (!top && blk->yMin < *yMin) {
00788       if (x < *xMin) {
00789         continue;
00790       }
00791     }
00792 
00793     // check: below bottom limit?
00794     if (!bottom && blk->yMax > *yMax) {
00795       if (x > *xMax) {
00796         return gFalse;
00797       }
00798     }
00799 
00800     // compare the strings
00801     for (j = 0; j < len; ++j) {
00802 #if 1 //~ this lowercases Latin A-Z only -- this will eventually be
00803       //~ extended to handle other character sets
00804       if (p[j] >= 0x41 && p[j] <= 0x5a) {
00805         u1 = p[j] + 0x20;
00806       } else {
00807         u1 = p[j];
00808       }
00809       if (s[j] >= 0x41 && s[j] <= 0x5a) {
00810         u2 = s[j] + 0x20;
00811       } else {
00812         u2 = s[j];
00813       }
00814 #endif
00815       if (u1 != u2) {
00816         break;
00817       }
00818     }
00819 
00820     // found it
00821     if (j == len) {
00822       *xMin = x0;
00823       *xMax = blk->xRight[i + len - 1];
00824       *yMin = blk->yMin;
00825       *yMax = blk->yMax;
00826       return gTrue;
00827     }
00828       }
00829     }
00830   }
00831 
00832   return gFalse;
00833 }
00834 
00835 GString *TextPage::getText(double xMin, double yMin,
00836                double xMax, double yMax) {
00837   GString *s;
00838   UnicodeMap *uMap;
00839   GBool isUnicode;
00840   char space[8], eol[16], buf[8];
00841   int spaceLen, eolLen, len;
00842   TextLine *line;
00843   TextBlock *blk;
00844   double x0, x1, y;
00845   int firstCol, col, i;
00846   GBool multiLine;
00847 
00848   s = new GString();
00849 
00850   // get the output encoding
00851   if (!(uMap = globalParams->getTextEncoding())) {
00852     return s;
00853   }
00854   isUnicode = uMap->isUnicode();
00855   spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
00856   eolLen = 0; // make gcc happy
00857   switch (globalParams->getTextEOL()) {
00858   case eolUnix:
00859     eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
00860     break;
00861   case eolDOS:
00862     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
00863     eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
00864     break;
00865   case eolMac:
00866     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
00867     break;
00868   }
00869 
00870   // find the leftmost column
00871   multiLine = gFalse;
00872   firstCol = -1;
00873   for (line = lines; line; line = line->next) {
00874     if (line->yMin > yMax) {
00875       break;
00876     }
00877     if (line->yMax < yMin) {
00878       continue;
00879     }
00880 
00881     for (blk = line->blocks; blk && blk->xMax < xMin; blk = blk->next) ;
00882     if (!blk || blk->xMin > xMax) {
00883       continue;
00884     }
00885 
00886     y = 0.5 * (blk->yMin + blk->yMax);
00887     if (y < yMin || y > yMax) {
00888       continue;
00889     }
00890 
00891     if (firstCol >= 0) {
00892       multiLine = gTrue;
00893     }
00894 
00895     i = 0;
00896     while (1) {
00897       x0 = (i==0) ? blk->xMin : blk->xRight[i-1];
00898       x1 = blk->xRight[i];
00899       if (0.5 * (x0 + x1) > xMin) {
00900     break;
00901       }
00902       ++i;
00903     }
00904     col = blk->col[i];
00905 
00906     if (firstCol < 0 || col < firstCol) {
00907       firstCol = col;
00908     }
00909   }
00910 
00911   // extract the text
00912   for (line = lines; line; line = line->next) {
00913     if (line->yMin > yMax) {
00914       break;
00915     }
00916     if (line->yMax < yMin) {
00917       continue;
00918     }
00919 
00920     for (blk = line->blocks; blk && blk->xMax < xMin; blk = blk->next) ;
00921     if (!blk || blk->xMin > xMax) {
00922       continue;
00923     }
00924 
00925     y = 0.5 * (blk->yMin + blk->yMax);
00926     if (y < yMin || y > yMax) {
00927       continue;
00928     }
00929 
00930     i = 0;
00931     while (1) {
00932       x0 = (i==0) ? blk->xMin : blk->xRight[i-1];
00933       x1 = blk->xRight[i];
00934       if (0.5 * (x0 + x1) > xMin) {
00935     break;
00936       }
00937       ++i;
00938     }
00939 
00940     col = firstCol;
00941 
00942     do {
00943 
00944       // line this block up with the correct column
00945       for (; col < blk->col[i]; ++col) {
00946     s->append(space, spaceLen);
00947       }
00948 
00949       // print the block
00950       for (; i < blk->len; ++i) {
00951 
00952     x0 = (i==0) ? blk->xMin : blk->xRight[i-1];
00953     x1 = blk->xRight[i];
00954     if (0.5 * (x0 + x1) > xMax) {
00955       break;
00956     }
00957 
00958     len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf));
00959     s->append(buf, len);
00960     col += isUnicode ? 1 : len;
00961       }
00962       if (i < blk->len) {
00963     break;
00964       }
00965 
00966       // next block
00967       blk = blk->next;
00968       i = 0;
00969 
00970     } while (blk && blk->xMin < xMax);
00971 
00972     if (multiLine) {
00973       s->append(eol, eolLen);
00974     }
00975   }
00976 
00977   uMap->decRefCnt();
00978 
00979   return s;
00980 }
00981 
00982 void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) {
00983   UnicodeMap *uMap;
00984   char space[8], eol[16], eop[8], buf[8];
00985   int spaceLen, eolLen, eopLen, len;
00986   TextLine *line;
00987   TextBlock *blk;
00988   int col, d, i;
00989 
00990   // get the output encoding
00991   if (!(uMap = globalParams->getTextEncoding())) {
00992     return;
00993   }
00994   spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
00995   eolLen = 0; // make gcc happy
00996   switch (globalParams->getTextEOL()) {
00997   case eolUnix:
00998     eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
00999     break;
01000   case eolDOS:
01001     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
01002     eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
01003     break;
01004   case eolMac:
01005     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
01006     break;
01007   }
01008   eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
01009 
01010   // output
01011   for (line = lines; line; line = line->next) {
01012     col = 0;
01013     for (blk = line->blocks; blk; blk = blk->next) {
01014 
01015       // line this block up with the correct column
01016       if (rawOrder && col == 0) {
01017     col = blk->col[0];
01018       } else {
01019     for (; col < blk->col[0]; ++col) {
01020       (*outputFunc)(outputStream, space, spaceLen);
01021     }
01022       }
01023 
01024       // print the block
01025       for (i = 0; i < blk->len; ++i) {
01026     len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf));
01027     (*outputFunc)(outputStream, buf, len);
01028       }
01029       col += blk->convertedLen;
01030     }
01031 
01032     // print a return
01033     (*outputFunc)(outputStream, eol, eolLen);
01034 
01035     // print extra vertical space if necessary
01036     if (line->next) {
01037       d = (int)((line->next->yMin - line->yMax) /
01038         (line->blocks->strings->yMax - lines->blocks->strings->yMin)
01039         + 0.5);
01040       // various things (weird font matrices) can result in bogus
01041       // values here, so do a sanity check
01042       if (rawOrder && d > 2) {
01043     d = 2;
01044       } else if (!rawOrder && d > 5) {
01045     d = 5;
01046       }
01047       for (; d > 0; --d) {
01048     (*outputFunc)(outputStream, eol, eolLen);
01049       }
01050     }
01051   }
01052 
01053   // end of page
01054   (*outputFunc)(outputStream, eol, eolLen);
01055   (*outputFunc)(outputStream, eop, eopLen);
01056   (*outputFunc)(outputStream, eol, eolLen);
01057 
01058   uMap->decRefCnt();
01059 }
01060 
01061 // Returns true if <str1> should be inserted before <str2> in xy
01062 // order.
01063 GBool TextPage::xyBefore(TextString *str1, TextString *str2) {
01064   return str1->xMin < str2->xMin ||
01065      (str1->xMin == str2->xMin && str1->yMin < str2->yMin);
01066 }
01067 
01068 // Returns true if <blk1> should be inserted before <blk2> in xy
01069 // order.
01070 GBool TextPage::xyBefore(TextBlock *blk1, TextBlock *blk2) {
01071   return blk1->xMin < blk2->xMin ||
01072      (blk1->xMin == blk2->xMin && blk1->yMin < blk2->yMin);
01073 }
01074 
01075 // Returns true if <blk1> should be inserted before <blk2> in yx
01076 // order, allowing a little slack for vertically overlapping text.
01077 GBool TextPage::yxBefore(TextBlock *blk1, TextBlock *blk2) {
01078   double h1, h2, overlap;
01079 
01080   h1 = blk1->yMax - blk1->yMin;
01081   h2 = blk2->yMax - blk2->yMin;
01082   overlap = ((blk1->yMax < blk2->yMax ? blk1->yMax : blk2->yMax) -
01083          (blk1->yMin > blk2->yMin ? blk1->yMin : blk2->yMin)) /
01084             (h1 < h2 ? h1 : h2);
01085   if (overlap > 0.6) {
01086     return blk1->xMin < blk2->xMin;
01087   }
01088   return blk1->yMin < blk2->yMin;
01089 }
01090 
01091 double TextPage::coalesceFit(TextString *str1, TextString *str2) {
01092   double h1, h2, w1, w2, r, overlap, spacing;
01093 
01094   h1 = str1->yMax - str1->yMin;
01095   h2 = str2->yMax - str2->yMin;
01096   w1 = str1->xMax - str1->xMin;
01097   w2 = str2->xMax - str2->xMin;
01098   r = h1 / h2;
01099   if (r < (1.0 / 3.0) || r > 3) {
01100     return 10;
01101   }
01102   overlap = ((str1->yMax < str2->yMax ? str1->yMax : str2->yMax) -
01103          (str1->yMin > str2->yMin ? str1->yMin : str2->yMin)) /
01104             (h1 < h2 ? h1 : h2);
01105   if (overlap < 0.5) {
01106     return 10;
01107   }
01108   spacing = (str2->xMin - str1->xMax) / (h1 > h2 ? h1 : h2);
01109   if (spacing < -0.5) {
01110     return 10;
01111   }
01112   // separate text that overlaps - duplicated text (so that fake
01113   // boldface and shadowed text can be cleanly removed)
01114   if ((str2->xMin - str1->xMax) / (w1 < w2 ? w1 : w2) < -0.7) {
01115     return 10;
01116   }
01117   return spacing;
01118 }
01119 
01120 void TextPage::clear() {
01121   TextLine *p1, *p2;
01122   TextString *s1, *s2;
01123 
01124   if (curStr) {
01125     delete curStr;
01126     curStr = NULL;
01127   }
01128   if (lines) {
01129     for (p1 = lines; p1; p1 = p2) {
01130       p2 = p1->next;
01131       delete p1;
01132     }
01133   } else if (xyStrings) {
01134     for (s1 = xyStrings; s1; s1 = s2) {
01135       s2 = s1->next;
01136       delete s1;
01137     }
01138   }
01139   xyStrings = NULL;
01140   xyCur1 = xyCur2 = NULL;
01141   lines = NULL;
01142   nest = 0;
01143   nTinyChars = 0;
01144 }
01145 
01146 //------------------------------------------------------------------------
01147 // TextOutputDev
01148 //------------------------------------------------------------------------
01149 
// Output callback that writes <len> bytes of <text> to <stream>, which
// must be a FILE *.  Calls with a null stream, a null buffer or a
// non-positive length are ignored; without this guard a negative <len>
// would be converted to a huge unsigned count when passed to fwrite.
static void outputToFile(void *stream, char *text, int len) {
  if (!stream || !text || len <= 0) {
    return;
  }
  fwrite(text, 1, (size_t)len, (FILE *)stream);
}
01153 
01154 TextOutputDev::TextOutputDev(char *fileName, GBool rawOrderA, GBool append) {
01155   text = NULL;
01156   rawOrder = rawOrderA;
01157   ok = gTrue;
01158 
01159   // open file
01160   needClose = gFalse;
01161   if (fileName) {
01162     if (!strcmp(fileName, "-")) {
01163       outputStream = stdout;
01164     } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
01165       needClose = gTrue;
01166     } else {
01167       error(-1, "Couldn't open text file '%s'", fileName);
01168       ok = gFalse;
01169       return;
01170     }
01171     outputFunc = &outputToFile;
01172   } else {
01173     outputStream = NULL;
01174   }
01175 
01176   // set up text object
01177   text = new TextPage(rawOrder);
01178 }
01179 
01180 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
01181                  GBool rawOrderA) {
01182   outputFunc = func;
01183   outputStream = stream;
01184   needClose = gFalse;
01185   rawOrder = rawOrderA;
01186   text = new TextPage(rawOrder);
01187   ok = gTrue;
01188 }
01189 
01190 TextOutputDev::~TextOutputDev() {
01191   if (needClose) {
01192 #ifdef MACOS
01193     ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle);
01194 #endif
01195     fclose((FILE *)outputStream);
01196   }
01197   if (text) {
01198     delete text;
01199   }
01200 }
01201 
01202 void TextOutputDev::startPage(int /*pageNum*/, GfxState */*state*/) {
01203   text->clear();
01204 }
01205 
01206 void TextOutputDev::endPage() {
01207   text->coalesce();
01208   if (outputStream) {
01209     text->dump(outputStream, outputFunc);
01210   }
01211 }
01212 
01213 void TextOutputDev::updateFont(GfxState *state) {
01214   text->updateFont(state);
01215 }
01216 
01217 void TextOutputDev::beginString(GfxState *state, GString */*s*/) {
01218   text->beginString(state, state->getCurX(), state->getCurY());
01219 }
01220 
01221 void TextOutputDev::endString(GfxState */*state*/) {
01222   text->endString();
01223 }
01224 
01225 void TextOutputDev::drawChar(GfxState *state, double x, double y,
01226                  double dx, double dy,
01227                  double /*originX*/, double /*originY*/,
01228                  CharCode /*c*/, Unicode *u, int uLen) {
01229   text->addChar(state, x, y, dx, dy, u, uLen);
01230 }
01231 
01232 GBool TextOutputDev::findText(Unicode *s, int len,
01233                   GBool top, GBool bottom,
01234                   double *xMin, double *yMin,
01235                   double *xMax, double *yMax) {
01236   return text->findText(s, len, top, bottom, xMin, yMin, xMax, yMax);
01237 }
01238 
01239 GString *TextOutputDev::getText(double xMin, double yMin,
01240                 double xMax, double yMax) {
01241   return text->getText(xMin, yMin, xMax, yMax);
01242 }
01243 
KDE Home | KDE Accessibility Home | Description of Access Keys