00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include "FilterPage.h"
00021
00022 #include <math.h>
00023
00024 #include <kglobal.h>
00025 #include <kdebug.h>
00026
00027 #include "data.h"
00028 #include "transform.h"
00029 #include "dialog.h"
00030
// Debug timing helpers. Both macros expect an object named `_time`, offering
// restart() and elapsed(), to be accessible where they are expanded.
// Each body is wrapped in do { ... } while (0) so that the macro followed by
// a semicolon forms exactly one statement and can be used safely as the
// unbraced branch of an if/else.
#define TIME_START(str) do { \
    kdDebug(30516) << (str) << endl; \
    _time.restart(); \
} while (0)
#define TIME_END do { \
    kdDebug(30516) << "elapsed=" << _time.elapsed() << endl; \
} while (0)
00036
00037
00038 namespace PDFImport
00039 {
00040
00041
00042 Page::Page(Data &data)
00043 : TextPage(false), _data(data), _lastStr(0), _rects(Nb_ParagraphTypes)
00044 {
00045 _links.setAutoDelete(true);
00046 }
00047
00048 void Page::clear()
00049 {
00050 TextPage::clear();
00051 _lastStr = 0;
00052 _links.clear();
00053 _pars.clear();
00054 }
00055
00056 void Page::beginString(GfxState *state, double x0, double y0)
00057 {
00058
00059
00060 if (curStr) {
00061 ++nest;
00062 return;
00063 }
00064
00065
00066 curStr = new String(state, x0, y0, fontSize, _data.textIndex());
00067
00068 }
00069
00070 void Page::endString()
00071 {
00072
00073
00074
00075 TextPage::endString();
00076
00077 }
00078
00079 void Page::addString(TextString *str)
00080 {
00081
00082
00083 if (_lastStr) _lastStr->checkCombination(str);
00084 _lastStr = (str->len==0 ? 0 : static_cast<String *>(str));
00085
00086
00087
00088
00089 TextPage::addString(str);
00090
00091 }
00092
00093 TextBlock *Page::block(TextLine *line, int index)
00094 {
00095 uint k = 0;
00096 if ( index<0 )
00097 for (TextBlock *block = line->blocks; block; block = block->next) k++;
00098 k += index;
00099 uint i = 0;
00100 for (TextBlock *block = line->blocks; block; block = block->next) {
00101 if ( i==k ) return block;
00102 i++;
00103 }
00104 return 0;
00105 }
00106
00107
00108 bool Page::isLastParagraphLine(TextLine *line, const Paragraph &par)
00109 {
00110
00111 if ( line->next==0 ) return true;
00112 double dy = line->next->yMin - line->yMax;
00113 double ndy = line->next->yMax - line->next->yMin;
00114 String *str = static_cast<String *>(line->blocks->strings);
00115 String *nStr = static_cast<String *>(line->next->blocks->strings);
00116
00117 if ( dy>0.5*ndy ) return true;
00118
00119 if ( str->frameIndex()!=nStr->frameIndex() ) return true;
00120 if ( line->blocks==0 ) return false;
00121
00122 if (line->blocks->next) return true;
00123 if ( line->next && line->next->blocks==0 ) return false;
00124
00125 if ( line->next && line->next->blocks->next ) return true;
00126 TextBlock *b = block(line, -1);
00127 if ( b==0 || b->len==0 ) return false;
00128 QChar c = QChar(b->text[b->len-1]);
00129
00130 if ( c!='.' && c!=':' ) return false;
00131
00132 return ( !equal(b->xMax, par.rect().right()) );
00133 }
00134
00135 void Page::createParagraphs()
00136 {
00137 TextLine *first = lines;
00138 uint nbLines = 0;
00139 for (TextLine *line = lines; line; line = line->next) {
00140 nbLines++;
00141 Paragraph par(first, nbLines);
00142 if ( isLastParagraphLine(line, par) ) {
00143 _pars.push_back(par);
00144 nbLines = 0;
00145 first = line->next;
00146 }
00147 }
00148 }
00149
00150 void Page::checkHeader()
00151 {
00152 uint s = _pars.size();
00153 if ( s==0 ) return;
00154 Paragraph &par = _pars[0];
00155 if ( par.lines().count()!=1 ) return;
00156 const TextLine *first = par.lines().first();
00157 const TextLine *second = (s>1 ? _pars[1].lines().first() : 0);
00158 double limit = 0.2 * _data.pageRect().height();
00159 double delta = 2 * kMin(first->yMax - first->yMin, 12.0);
00160
00161
00162
00163
00164 if ( first->yMax>limit ) return;
00165 if ( second && (second->yMin-first->yMax)<delta ) return;
00166 par.type = Header;
00167 _rects[Header] = par.rect();
00168 }
00169
00170 bool Page::hasHeader() const
00171 {
00172 return (_pars.size()>0 ? _pars[0].type==Header : false);
00173 }
00174
00175 void Page::checkFooter()
00176 {
00177 uint s = _pars.size();
00178 if ( s==0 ) return;
00179 Paragraph &par = _pars[s-1];
00180 if ( par.lines().count()!=1 ) return;
00181 const TextLine *last = par.lines().first();
00182 const TextLine *blast = (s>1 ? _pars[s-2].lines().last() : 0);
00183 double limit = 0.8 * _data.pageRect().height();
00184 double delta = 2 * kMin(last->yMax-last->yMin, 12.0);
00185
00186
00187
00188
00189 if ( last->yMin<limit ) return;
00190 if ( blast && (last->yMin-blast->yMax)<delta ) return;
00191 par.type = Footer;
00192 _rects[Footer] = par.rect();
00193 }
00194
00195 bool Page::hasFooter() const
00196 {
00197 return (_pars.size()>0 ? _pars[_pars.size()-1].type==Footer
00198 : false);
00199 }
00200
00201 void Page::endPage()
00202 {
00203 TIME_START("coalesce strings");
00204 TextPage::coalesce();
00205 TIME_END;
00206
00207 createParagraphs();
00208
00209
00210 checkHeader();
00211
00212 checkFooter();
00213
00214
00215
00216 uint begin = (hasHeader() ? 1 : 0);
00217 uint end = _pars.size() - (hasFooter() ? 1 : 0);
00218 for (uint i=begin; i<end; i++)
00219 _rects[Body].unite(_pars[i].rect());
00220 }
00221
00222
00223
00224 void Page::initParagraph(Paragraph &par) const
00225 {
00226 bool rightAligned = true, centered = true, leftAligned = true;
00227 const double pleft = _rects[par.type].left();
00228 const double pright = _rects[par.type].right();
00229 const double pmean = (pleft + pright) / 2;
00230
00231 QValueList<TextLine *>::const_iterator it;
00232 for (it = par.lines().begin(); it!=par.lines().end(); ++it) {
00233
00234
00235 Tabulator tab;
00236 for (TextBlock *blk = (*it)->blocks; blk; blk = blk->next) {
00237
00238
00239 double tabRightAligned = equal(blk->xMax, pright);
00240 double dx = (tabRightAligned ? pright : blk->xMin) - pleft;
00241
00242
00243 if (tabRightAligned) dx -= 0.1;
00244 int res = par.findTab(dx, *it);
00245 if ( res==-1 ) {
00246 tab.pos = dx;
00247 if (tabRightAligned) {
00248 tab.alignment = Tabulator::Right;
00249 kdDebug(30516) << "tabulated text right aligned.." << endl;
00250 } else tab.alignment = Tabulator::Left;
00251 par.tabs.push_back(tab);
00252 }
00253 }
00254 qHeapSort2(par.tabs);
00255
00256
00257 double left = (*it)->blocks->xMin - pleft;
00258 if ( par.isFirst(*it) ) {
00259 par.firstIndent = left;
00260 par.leftIndent = left;
00261 } else if ( par.isSecond(*it) ) par.leftIndent = left;
00262 else par.leftIndent = kMin(par.leftIndent, left);
00263 }
00264
00265
00266 for (it = par.lines().begin(); it!=par.lines().end(); ++it) {
00267 double left = (*it)->blocks->xMin;
00268 double right = block(*it, -1)->xMax;
00269 double mean = (left + right) / 2;
00270
00271
00272
00273
00274
00275
00276
00277 if ( centered && !equal(mean, pmean) ) centered = false;
00278 if ( leftAligned && (!par.isFirst(*it) || par.hasOneLine())
00279 && !equal(left, pleft + par.leftIndent, 0.05) ) {
00280 kdDebug(30516) << "not left aligned" << endl;
00281 leftAligned = false;
00282 }
00283 if ( rightAligned && (!par.isLast(*it) || par.hasOneLine())
00284 && !equal(right, pright, 0.05) ) {
00285 kdDebug(30516) << "not right aligned" << endl;
00286 rightAligned = false;
00287 }
00288 }
00289
00290
00291 if (rightAligned) par.align = (leftAligned ? AlignBlock : AlignRight);
00292 else if (centered) par.align = AlignCenter;
00293 }
00294
00295 void Page::fillParagraph(Paragraph &par, double &offset) const
00296 {
00297 const double pleft = _rects[par.type].left();
00298 const double pright = _rects[par.type].right();
00299 par.offset = par.lines().first()->yMin - offset;
00300
00301
00302
00303 if ( par.offset>0 ) offset += par.offset;
00304
00305 QValueList<TextLine *>::const_iterator it;
00306 for (it = par.lines().begin(); it!=par.lines().end(); ++it) {
00307
00308 if ( !par.isFirst(*it) ) {
00309 bool hyphen = false;
00310 if (_data.options().smart) {
00311
00312 uint bi, pbi;
00313 int si = par.charFromEnd(0, bi);
00314 Q_ASSERT( si>=0 );
00315 QChar c = par.blocks[bi].text[si];
00316 int psi = par.charFromEnd(1, pbi);
00317 QChar prev = (psi<0 ? QChar::null : par.blocks[pbi].text[psi]);
00318 if ( !prev.isNull() && type(c.unicode())==Hyphen )
00319 kdDebug(30516) << "hyphen ? " << QString(prev)
00320 << " type=" << type(prev.unicode())
00321 << endl;
00322 TextString *next =
00323 ((*it)->next ? (*it)->next->blocks->strings : 0);
00324 if ( !prev.isNull() && type(c.unicode())==Hyphen
00325 && isLetter( type(prev.unicode()) )
00326 && next && next->len>0
00327 && isLetter( type(next->text[next->len-1]) ) ) {
00328 kdDebug(30516) << "found hyphen" << endl;
00329 hyphen = true;
00330 par.blocks[bi].text.remove(si, 1);
00331 }
00332 }
00333 if ( !hyphen ) {
00334 Block b;
00335 bool remove = _data.options().smart;
00336 if ( remove && par.align!=AlignBlock )
00337 remove = ( par.rect().right()>0.9*pright );
00338 b.text = (remove ? ' ' : '\n');
00339 b.font = static_cast<String *>((*it)->blocks->strings)->font();
00340 par.blocks.push_back(b);
00341 }
00342 }
00343
00344 int lineHeight = 0;
00345 TextBlock *prevBlk = 0;
00346 for (TextBlock *blk = (*it)->blocks; blk; blk = blk->next) {
00347
00348
00349 double tabRightAligned = equal(blk->xMax, pright);
00350 double dx = (tabRightAligned ? pright : blk->xMin) - pleft;
00351 int res = par.findTab(dx, *it);
00352 if ( res>=0 ) {
00353 if (prevBlk) {
00354 double xMax = prevBlk->xMax - pleft;
00355 res = par.findNbTabs(res, xMax);
00356 if ( res==0 ) continue;
00357 } else res++;
00358
00359
00360 if ( prevBlk || !_data.options().smart
00361 || (par.align!=AlignCenter && par.align!=AlignRight) ) {
00362 Block b;
00363 b.font = static_cast<String *>(blk->strings)->font();
00364 for (uint k=0; k<(uint)res; k++) b.text += '\t';
00365 par.blocks.push_back(b);
00366 }
00367 }
00368
00369
00370 for (TextString *str = blk->strings; str; str = str->next) {
00371 Block b;
00372 for (uint k = 0; k<uint(str->len); k++)
00373 b.text += QChar(str->text[k]);
00374 if (str->spaceAfter) b.text += ' ';
00375 String *fstr = static_cast<String *>(str);
00376 b.font = fstr->font();
00377 b.link = fstr->link;
00378 par.blocks.push_back(b);
00379 lineHeight = kMax(lineHeight, b.font.height());
00380 }
00381
00382 prevBlk = blk;
00383 }
00384
00385 offset += lineHeight;
00386 }
00387 }
00388
00389 FontFamily Page::checkSpecial(QChar &c, const Font &font) const
00390 {
00391 Unicode res = 0;
00392 switch ( PDFImport::checkSpecial(c.unicode(), res) ) {
00393 case Bullet:
00394 kdDebug(30516) << "found bullet" << endl;
00395
00396
00397 c = res;
00398 return Symbol;
00399 case SuperScript:
00400 kdDebug(30516) << "found superscript" << endl;
00401
00402 break;
00403 case LatexSpecial:
00404 if ( !font.isLatex() ) break;
00405 kdDebug(30516) << "found latex special" << endl;
00406 return Times;
00407 case SpecialSymbol:
00408 kdDebug(30516) << "found symbol=" << c.unicode() << endl;
00409 return Times;
00410
00411 default:
00412 break;
00413 }
00414
00415 return Nb_Family;
00416 }
00417
00418 void Page::checkSpecialChars(Paragraph &par) const
00419 {
00420 QValueList<Block> blocks;
00421 for (uint k=0; k<par.blocks.size(); k++) {
00422 const Block &b = par.blocks[k];
00423 QString res;
00424
00425 for (uint l=0; l<b.text.length(); l++) {
00426 QChar c = b.text[l];
00427 FontFamily family = checkSpecial(c, b.font);
00428 if ( family==Nb_Family ) res += c;
00429 else {
00430 if ( !res.isEmpty() ) {
00431 blocks.push_back(b);
00432 blocks.back().text = res;
00433 res = QString::null;
00434 }
00435 blocks.push_back(b);
00436 blocks.back().font.setFamily(family);
00437 blocks.back().text = c;
00438 }
00439 }
00440 if ( !res.isEmpty() ) {
00441 blocks.push_back(b);
00442 blocks.back().text = res;
00443 }
00444 }
00445 par.blocks = blocks;
00446 }
00447
00448 void Page::coalesce(Paragraph &par) const
00449 {
00450 QValueList<Block> blocks;
00451 blocks.push_back(par.blocks[0]);
00452 for (uint k=1; k<par.blocks.size(); k++) {
00453 const Block &b = par.blocks[k];
00454 if ( b.link==blocks.back().link && b.font==blocks.back().font )
00455 blocks.back().text += b.text;
00456 else blocks.push_back(b);
00457 }
00458 par.blocks = blocks;
00459 }
00460
00461 void Page::prepare()
00462 {
00463 TIME_START("associate links");
00464 for (Link *link=_links.first(); link; link=_links.next()) {
00465 const DRect &r = link->rect();
00466
00467 for (TextLine *line = lines; line; line = line->next)
00468 for (TextBlock *blk = line->blocks; blk; blk = blk->next)
00469 for (TextString *str = blk->strings; str; str = str->next) {
00470 String *fstr = static_cast<String *>(str);
00471 DRect sr = fstr->rect();
00472
00473
00474 if ( r.isInside(sr) ) fstr->link = link;
00475 }
00476 }
00477 TIME_END;
00478
00479 TIME_START("init paragraphs");
00480 for (uint i=0; i<_pars.size(); i++) {
00481 initParagraph(_pars[i]);
00482
00483
00484 if ( _pars[i].align==AlignBlock && _pars[i].hasOneLine()
00485 && _pars[i].tabs.size()==0
00486 && (_pars.size()==1
00487 || (i!=0 && _pars[i-1].align==AlignCenter)
00488 || ((i+1)!=_pars.size() && _pars[i+1].align==AlignCenter)) )
00489 _pars[i].align = AlignCenter;
00490 }
00491 TIME_END;
00492
00493 TIME_START("fill paragraphs");
00494 uint begin = 0;
00495 if ( hasHeader() ) {
00496 double offset = _rects[Header].top();
00497 fillParagraph(_pars[0], offset);
00498 begin++;
00499 }
00500 uint end = _pars.size();
00501 if ( hasFooter() ) {
00502 double offset = _rects[Footer].top();
00503 end--;
00504 fillParagraph(_pars[end], offset);
00505 }
00506 double offset = _rects[Body].top();
00507 for (uint i=begin; i<end; i++)
00508 fillParagraph(_pars[i], offset);
00509 TIME_END;
00510
00511 TIME_START("check for special chars");
00512 for (uint i=0; i<_pars.size(); i++)
00513 checkSpecialChars(_pars[i]);
00514 TIME_END;
00515
00516
00517 TIME_START("coalesce formats");
00518 for (uint i=0; i<_pars.size(); i++)
00519 coalesce(_pars[i]);
00520 TIME_END;
00521
00522
00523 if ( _pars.size()==0 ) {
00524 Block b;
00525 Paragraph par(0, 0);
00526 par.blocks.push_back(b);
00527 _pars.push_back(par);
00528 }
00529 }
00530
00531 void Page::dump(const Paragraph &par)
00532 {
00533 QValueVector<QDomElement> layouts;
00534 QValueVector<QDomElement> formats;
00535
00536
00537 for (uint k=0; k<par.tabs.size(); k++) {
00538 QDomElement element = par.tabs[k].createElement(_data);
00539 layouts.push_back(element);
00540 }
00541
00542
00543 if ( !_data.options().smart || par.align!=AlignCenter ) {
00544 QDomElement element = _data.createElement("INDENTS");
00545 element.setAttribute("left", par.leftIndent);
00546 double delta = par.firstIndent - par.leftIndent;
00547 if ( !equal(delta, 0) ) element.setAttribute("first", delta);
00548 layouts.push_back(element);
00549 }
00550
00551
00552 if ( par.offset>0 ) {
00553 QDomElement element = _data.createElement("OFFSETS");
00554 element.setAttribute("before", par.offset);
00555 layouts.push_back(element);
00556 }
00557
00558
00559 if (_data.options().smart) {
00560 QString flow;
00561
00562 switch (par.align) {
00563 case AlignLeft: break;
00564 case AlignRight: flow = "right"; break;
00565 case AlignCenter: flow = "center"; break;
00566 case AlignBlock: flow = "justify"; break;
00567 }
00568 if ( !flow.isEmpty() ) {
00569 QDomElement element = _data.createElement("FLOW");
00570 element.setAttribute("align", flow.utf8());
00571 layouts.push_back(element);
00572 }
00573 }
00574
00575
00576 QString text;
00577 uint pos = 0;
00578 for (uint k=0; k<par.blocks.size(); k++) {
00579 const Block &b = par.blocks[k];
00580 text += (b.link ? "#" : b.text);
00581 uint len = (b.link ? 1 : b.text.length());
00582 QDomElement element = _data.createElement("FORMAT");
00583 QDomDocument document = _data.document();
00584 bool r = b.font.format(document, element, pos, len);
00585 if (b.link) b.link->format(document, element, pos, b.text);
00586 if ( r || b.link ) formats.push_back(element);
00587 pos += len;
00588 }
00589
00590 _data.createParagraph(text, par.type, layouts, formats);
00591 }
00592
00593 void Page::dump()
00594 {
00595 prepare();
00596
00597 TIME_START("dump XML");
00598 for (uint i=0; i<_pars.size(); i++)
00599 dump(_pars[i]);
00600 TIME_END;
00601 }
00602
00603 }