summaryrefslogtreecommitdiffstats
path: root/src/extension/internal/pdfinput/poppler-utils.cpp
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-13 11:50:49 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-13 11:50:49 +0000
commitc853ffb5b2f75f5a889ed2e3ef89b818a736e87a (patch)
tree7d13a0883bb7936b84d6ecdd7bc332b41ed04bee /src/extension/internal/pdfinput/poppler-utils.cpp
parentInitial commit. (diff)
downloadinkscape-c853ffb5b2f75f5a889ed2e3ef89b818a736e87a.tar.xz
inkscape-c853ffb5b2f75f5a889ed2e3ef89b818a736e87a.zip
Adding upstream version 1.3+ds.upstream/1.3+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/extension/internal/pdfinput/poppler-utils.cpp')
-rw-r--r--src/extension/internal/pdfinput/poppler-utils.cpp599
1 files changed, 599 insertions, 0 deletions
diff --git a/src/extension/internal/pdfinput/poppler-utils.cpp b/src/extension/internal/pdfinput/poppler-utils.cpp
new file mode 100644
index 0000000..26746dc
--- /dev/null
+++ b/src/extension/internal/pdfinput/poppler-utils.cpp
@@ -0,0 +1,599 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/** @file
+ * PDF parsing utilities for libpoppler.
+ *//*
+ * Authors:
+ * Martin Owens
+ *
+ * Copyright (C) 2022 Authors
+ *
+ * Released under GNU GPL v2+, read the file 'COPYING' for more information.
+ */
+
+#include "poppler-utils.h"
+
+#include <memory>
+
+#include "2geom/affine.h"
+#include "GfxFont.h"
+#include "GfxState.h"
+#include "PDFDoc.h"
+#include "libnrtype/font-factory.h"
+
+/**
+ * Build a lib2geom affine from the current transformation matrix (CTM)
+ * stored in the given poppler graphics state.
+ */
+Geom::Affine stateToAffine(GfxState *state)
+{
+    const auto &matrix = state->getCTM();
+    return ctmToAffine(matrix);
+}
+
+/**
+ * Turn a six-element transformation matrix into a lib2geom affine object.
+ *
+ * A null matrix pointer yields the identity transform.
+ */
+Geom::Affine ctmToAffine(const double *ctm)
+{
+    if (ctm) {
+        return Geom::Affine(ctm[0], ctm[1], ctm[2], ctm[3], ctm[4], ctm[5]);
+    }
+    return Geom::identity();
+}
+
+/**
+ * Debug helper: write the six entries of a raw matrix to stdout on one line,
+ * formatted as "C:<label>:a,b,c,d,e,f".
+ */
+void ctmout(const char *label, const double *ctm)
+{
+    std::cout << "C:" << label << ":";
+    for (int idx = 0; idx < 6; ++idx) {
+        if (idx > 0) {
+            std::cout << ",";
+        }
+        std::cout << ctm[idx];
+    }
+    std::cout << "\n";
+}
+
+/**
+ * Debug helper: write the six coefficients of an affine to stdout on one line,
+ * formatted as "A:<label>:a,b,c,d,e,f".
+ */
+void affout(const char *label, Geom::Affine ctm)
+{
+    std::cout << "A:" << label << ":";
+    for (unsigned idx = 0; idx < 6; ++idx) {
+        if (idx > 0) {
+            std::cout << ",";
+        }
+        std::cout << ctm[idx];
+    }
+    std::cout << "\n";
+}
+
+//------------------------------------------------------------------------
+// GfxFontDict from GfxFont.cc in poppler 22.09
+//
+// Modified under the Poppler project - http://poppler.freedesktop.org
+//
+// All changes made under the Poppler project to this file are licensed
+// under GPL version 2 or later
+//
+// See poppler source code for full list of copyright holders.
+//------------------------------------------------------------------------
+
+/**
+ * Populate the font list from a PDF font resource dictionary.
+ *
+ * One slot is created per dictionary entry. Entries that do not resolve to a
+ * dictionary, or whose font reports it is not ok, are stored as null.
+ *
+ * @param xref        Cross reference table used to resolve indirect objects.
+ * @param fontDictRef Reference of the font dictionary itself, or null when the
+ *                    dictionary is a direct object.
+ * @param fontDict    The font resource dictionary to read.
+ */
+InkFontDict::InkFontDict(XRef *xref, Ref *fontDictRef, Dict *fontDict)
+{
+    Ref fontRef;
+
+    fonts.resize(fontDict->getLength());
+    for (std::size_t idx = 0; idx < fonts.size(); ++idx) {
+        const Object &entry = fontDict->getValNF(idx);
+        Object resolved = entry.fetch(xref);
+
+        if (!resolved.isDict()) {
+            error(errSyntaxError, -1, "font resource is not a dictionary");
+            fonts[idx] = nullptr;
+            continue;
+        }
+
+        // Choose an identifier for the font.
+        if (entry.isRef()) {
+            fontRef = entry.getRef();
+        } else if (fontDictRef) {
+            // legal generation numbers are five digits, so we use a
+            // 6-digit number here
+            fontRef.gen = 100000 + fontDictRef->num;
+            fontRef.num = idx;
+        } else {
+            // no indirect reference for this font, or for the containing
+            // font dict, so hash the font and use that
+            fontRef.gen = 100000;
+            fontRef.num = hashFontObject(&resolved);
+        }
+
+        // Newer poppler will require some reworking as it gives a shared ptr.
+        fonts[idx] = GfxFont::makeFont(xref, fontDict->getKey(idx), fontRef, resolved.getDict());
+        if (fonts[idx] && !fonts[idx]->isOk()) {
+            fonts[idx] = nullptr;
+        }
+    }
+}
+
+/**
+ * Return the first non-null font in the dictionary that matches the given
+ * tag, or nullptr when no font matches.
+ */
+FontPtr InkFontDict::lookup(const char *tag) const
+{
+    for (std::size_t idx = 0; idx < fonts.size(); ++idx) {
+        const auto &candidate = fonts[idx];
+        if (!candidate || !candidate->matches(tag)) {
+            continue;
+        }
+        return candidate;
+    }
+    return nullptr;
+}
+
+// FNV-1a hash
+class FNVHash
+{
+public:
+    // Start from the hash's initial value.
+    FNVHash()
+        : h(2166136261U)
+    {}
+
+    // Fold a single byte into the running hash.
+    void hash(char c)
+    {
+        h = (h ^ (c & 0xff)) * 16777619;
+    }
+
+    // Fold the first n bytes starting at p into the running hash.
+    void hash(const char *p, int n)
+    {
+        while (n-- > 0) {
+            hash(*p++);
+        }
+    }
+
+    // Reduce the running hash to a non-negative 31-bit value.
+    int get31()
+    {
+        const unsigned int folded = h ^ (h >> 31);
+        return folded & 0x7fffffff;
+    }
+
+private:
+    unsigned int h;
+};
+
+/**
+ * Hash the contents of a font object and return the result as a
+ * non-negative 31-bit integer.
+ */
+int InkFontDict::hashFontObject(Object *obj)
+{
+    FNVHash hasher;
+    hashFontObject1(obj, &hasher);
+    return hasher.get31();
+}
+
+/**
+ * Feed one object into the hash: a one-character type marker followed by
+ * the object's value. Arrays and dictionaries are walked recursively without
+ * resolving indirect references; a reference contributes its number and
+ * generation.
+ */
+void InkFontDict::hashFontObject1(const Object *obj, FNVHash *h)
+{
+    // Hash the in-memory bytes of an int.
+    auto hashInt = [h](int value) { h->hash(reinterpret_cast<const char *>(&value), sizeof(int)); };
+
+    switch (obj->getType()) {
+        case objBool:
+            h->hash('b');
+            h->hash(obj->getBool() ? 1 : 0);
+            break;
+        case objInt:
+            h->hash('i');
+            hashInt(obj->getInt());
+            break;
+        case objReal: {
+            h->hash('r');
+            double real = obj->getReal();
+            h->hash(reinterpret_cast<const char *>(&real), sizeof(double));
+            break;
+        }
+        case objString: {
+            h->hash('s');
+            const GooString *text = obj->getString();
+            h->hash(text->c_str(), text->getLength());
+            break;
+        }
+        case objName: {
+            h->hash('n');
+            const char *label = obj->getName();
+            h->hash(label, static_cast<int>(strlen(label)));
+            break;
+        }
+        case objNull:
+            h->hash('z');
+            break;
+        case objArray: {
+            h->hash('a');
+            const int count = obj->arrayGetLength();
+            hashInt(count);
+            for (int idx = 0; idx < count; ++idx) {
+                const Object &element = obj->arrayGetNF(idx);
+                hashFontObject1(&element, h);
+            }
+            break;
+        }
+        case objDict: {
+            h->hash('d');
+            const int count = obj->dictGetLength();
+            hashInt(count);
+            for (int idx = 0; idx < count; ++idx) {
+                const char *key = obj->dictGetKey(idx);
+                h->hash(key, static_cast<int>(strlen(key)));
+                const Object &value = obj->dictGetValNF(idx);
+                hashFontObject1(&value, h);
+            }
+            break;
+        }
+        case objStream:
+            // this should never happen - streams must be indirect refs
+            break;
+        case objRef:
+            h->hash('f');
+            hashInt(obj->getRefNum());
+            hashInt(obj->getRefGen());
+            break;
+        default:
+            h->hash('u');
+            break;
+    }
+}
+
+/**
+ * Return the font's name with a leading subset tag removed.
+ *
+ * A subset tag is recognised as exactly six capital letters A-Z followed by
+ * '+' and at least one further character. Names without such a prefix are
+ * returned unchanged; a font without a name yields an empty string.
+ */
+std::string getNameWithoutSubsetTag(FontPtr font)
+{
+    const auto &fontName = font->getName();
+    if (!fontName) {
+        return {};
+    }
+
+    std::string tagname = fontName->c_str();
+
+    // Length of the leading run of capital letters.
+    std::size_t capitals = 0;
+    while (capitals < tagname.size() && tagname[capitals] >= 'A' && tagname[capitals] <= 'Z') {
+        ++capitals;
+    }
+
+    const bool hasSubsetTag = capitals == 6 && tagname.size() > 7 && tagname[6] == '+';
+    if (hasSubsetTag) {
+        return tagname.substr(7);
+    }
+    return tagname;
+}
+
+/**
+ * Extract all the useful information from the GfxFont object.
+ *
+ * The name, family, style, weight, stretch and `found` members are filled in
+ * up to three passes:
+ *  1. values stored in the PDF font itself,
+ *  2. a pango font description built from the family plus the part of the
+ *     name after the first '-',
+ *  3. keyword matching on the lower-cased, whitespace-free font name.
+ *
+ * @param font The font to inspect; it is dereferenced unconditionally and so
+ *             must not be null.
+ */
+FontData::FontData(FontPtr font)
+{
+    // Level one parsing is taking the data from the PDF font, although this
+    // information is almost always missing. Perhaps sometimes it's not.
+    found = false;
+
+    // Style: italic, oblique, normal
+    style = font->isItalic() ? "italic" : "";
+
+    // Weight: normal, bold, etc
+    weight = "normal";
+    switch (font->getWeight()) {
+        case GfxFont::WeightNotDefined:
+            break;
+        case GfxFont::W400:
+            weight = "normal";
+            break;
+        case GfxFont::W700:
+            weight = "bold";
+            break;
+        default:
+            weight = std::to_string(font->getWeight() * 100);
+            break;
+    }
+
+    // Stretch: condensed or expanded
+    stretch = "";
+    switch (font->getStretch()) {
+        case GfxFont::UltraCondensed:
+            stretch = "ultra-condensed";
+            break;
+        case GfxFont::ExtraCondensed:
+            stretch = "extra-condensed";
+            break;
+        case GfxFont::Condensed:
+            stretch = "condensed";
+            break;
+        case GfxFont::SemiCondensed:
+            stretch = "semi-condensed";
+            break;
+        case GfxFont::Normal:
+            stretch = "normal";
+            break;
+        case GfxFont::SemiExpanded:
+            stretch = "semi-expanded";
+            break;
+        case GfxFont::Expanded:
+            stretch = "expanded";
+            break;
+        case GfxFont::ExtraExpanded:
+            stretch = "extra-expanded";
+            break;
+        case GfxFont::UltraExpanded:
+            stretch = "ultra-expanded";
+            break;
+        default:
+            // Any other value leaves the stretch empty.
+            break;
+    }
+
+    name = getNameWithoutSubsetTag(font);
+    // Use this when min-poppler version is newer:
+    // name = font->getNameWithoutSubsetTag();
+
+    // Embedded CID Fonts don't have family names
+    if (!font->getFamily())
+        return;
+
+    family = font->getFamily()->c_str();
+
+    // Level two parsing, we break off the font description part of the name
+    // which often contains font data and use it as a pango font description.
+    auto desc_str = family;
+    auto pos = name.find('-');
+    if (pos != std::string::npos) {
+        // Insert spaces where we see capital letters.
+        for (char l : name.substr(pos + 1)) {
+            if (l >= 'A' && l <= 'Z')
+                desc_str += ' ';
+            desc_str += l;
+        }
+    }
+
+    // Pango descriptions are heap allocated; own them so that every exit
+    // path, including the early return below, frees them.
+    using PangoDescPtr = std::unique_ptr<PangoFontDescription, decltype(&pango_font_description_free)>;
+
+    // Now we pull data out of the description.
+    PangoDescPtr desc(pango_font_description_from_string(desc_str.c_str()), &pango_font_description_free);
+    if (desc) {
+        // The family may be unset in the description, in which case pango returns NULL.
+        const char *new_family = pango_font_description_get_family(desc.get());
+        if (new_family && FontFactory::get().hasFontFamily(new_family)) {
+            family = new_family;
+
+            // Style from pango description
+            switch (pango_font_description_get_style(desc.get())) {
+                case PANGO_STYLE_ITALIC:
+                    style = "italic";
+                    break;
+                case PANGO_STYLE_OBLIQUE:
+                    style = "oblique";
+                    break;
+                default:
+                    break;
+            }
+
+            // Weight from pango description
+            auto pw = pango_font_description_get_weight(desc.get());
+            if (pw != PANGO_WEIGHT_NORMAL) {
+                weight = std::to_string(pw); // Number 100-1000
+            }
+
+            // Stretch from pango description
+            switch (pango_font_description_get_stretch(desc.get())) {
+                case PANGO_STRETCH_ULTRA_CONDENSED:
+                    stretch = "ultra-condensed";
+                    break;
+                case PANGO_STRETCH_EXTRA_CONDENSED:
+                    stretch = "extra-condensed";
+                    break;
+                case PANGO_STRETCH_CONDENSED:
+                    stretch = "condensed";
+                    break;
+                case PANGO_STRETCH_SEMI_CONDENSED:
+                    stretch = "semi-condensed";
+                    break;
+                case PANGO_STRETCH_SEMI_EXPANDED:
+                    stretch = "semi-expanded";
+                    break;
+                case PANGO_STRETCH_EXPANDED:
+                    stretch = "expanded";
+                    break;
+                case PANGO_STRETCH_EXTRA_EXPANDED:
+                    stretch = "extra-expanded";
+                    break;
+                case PANGO_STRETCH_ULTRA_EXPANDED:
+                    stretch = "ultra-expanded";
+                    break;
+                default:
+                    break;
+            }
+
+            // variant = TODO Convert to variant pango_font_description_get_variant(desc)
+
+            found = true;
+            // All information has been processed, don't over-write with level three.
+            return;
+        }
+        // Sometimes it's possible to match the description string directly.
+        PangoDescPtr family_desc(pango_font_description_from_string(family.c_str()),
+                                 &pango_font_description_free);
+        if (family_desc) {
+            const char *direct_family = pango_font_description_get_family(family_desc.get());
+            if (direct_family && FontFactory::get().hasFontFamily(direct_family)) {
+                family = direct_family;
+            }
+        }
+    }
+
+    found = FontFactory::get().hasFontFamily(family);
+    // TODO: If !found we could suggest a substitute
+
+    // Level three parsing, we take our name and attempt to match known style names
+    // Copy id-name stored in PDF and make it lower case and strip whitespaces.
+    // The <cctype> functions are undefined for negative char values, so each
+    // byte is passed as unsigned char.
+    std::string source = name;
+    std::transform(source.begin(), source.end(), source.begin(),
+                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+    source.erase(std::remove_if(source.begin(), source.end(), [](unsigned char c) { return ::isspace(c) != 0; }),
+                 source.end());
+    auto contains = [&source](const char *token) { return source.find(token) != std::string::npos; };
+
+    if (contains("italic") || contains("slanted")) {
+        style = "italic";
+    } else if (contains("oblique")) {
+        style = "oblique";
+    }
+
+    struct Keyword
+    {
+        const char *token;
+        const char *value;
+    };
+
+    // Every matching keyword is applied in table order, so the LAST match
+    // wins. The table used to be a std::map, which iterates in alphabetical
+    // key order rather than in the order written; that effective order is kept
+    // here, except that "light" now comes before "extralight" so that an
+    // extra-light font is no longer reported as 300.
+    // NOTE(review): neutral keywords such as "roman" still override "bold"
+    // (e.g. "timesnewroman-bold" ends up "normal") - confirm whether intended.
+    static const Keyword weights[] = {
+        // clang-format off
+        {"black",      "900"},
+        {"bold",       "bold"},
+        {"book",       "normal"},
+        {"demibold",   "600"},
+        {"extrabold",  "800"},
+        {"light",      "300"},
+        {"extralight", "200"},
+        {"heavy",      "900"},
+        {"medium",     "500"},
+        {"normal",     "normal"},
+        {"regular",    "normal"},
+        {"roman",      "normal"},
+        {"semibold",   "600"},
+        {"thin",       "100"},
+        {"ultrabold",  "800"},
+        {"ultralight", "200"},
+        // clang-format on
+    };
+    // Apply the font weight translations
+    for (const auto &w : weights) {
+        if (contains(w.token))
+            weight = w.value;
+    }
+
+    // Generic keywords first, more specific ones after, so that the specific
+    // keyword is the last match.
+    static const Keyword stretches[] = {
+        // clang-format off
+        {"condensed",      "condensed"},
+        {"expanded",       "expanded"},
+        {"extracondensed", "extra-condensed"},
+        {"extraexpanded",  "extra-expanded"},
+        {"semicondensed",  "semi-condensed"},
+        {"semiexpanded",   "semi-expanded"},
+        {"ultracondensed", "ultra-condensed"},
+        {"ultraexpanded",  "ultra-expanded"},
+        // clang-format on
+    };
+    // Apply the font stretch translations
+    for (const auto &s : stretches) {
+        if (contains(s.token))
+            stretch = s.value;
+    }
+}
+
+/*
+    MatchingChars
+    Count how many leading characters of sp are consumed while matching s1
+    against it, taking into account that a space in sp may be absent from s1
+    or stand for a '_' in s1. (Bug LP #179589)
+
+    Matching stops at the first character that differs for any other reason,
+    or when either string is exhausted. The return value is the index reached
+    in sp.
+
+    Both strings are taken by const reference: this is called once per
+    installed font name, so copying them on every call was wasted work.
+*/
+static size_t MatchingChars(const std::string &s1, const std::string &sp)
+{
+    size_t is = 0;
+    size_t ip = 0;
+
+    while (is < s1.length() && ip < sp.length()) {
+        if (s1[is] == sp[ip]) {
+            ++is;
+            ++ip;
+        } else if (sp[ip] == ' ') {
+            // A space in sp is always skipped...
+            ++ip;
+            // ...and an underscore in s1 counts as that space.
+            if (s1[is] == '_') {
+                ++is;
+            }
+        } else {
+            break;
+        }
+    }
+    return ip;
+}
+
+/*
+ * Scan the available fonts to find the font name that best matches this
+ * font's family.
+ *
+ * Returns an empty string when the font was found (no substitute needed).
+ * Otherwise returns the best matching available font name, or "Arial" when
+ * no available font matches at least its own first word.
+ */
+std::string FontData::getSubstitute() const
+{
+    if (found)
+        return "";
+
+    double bestMatch = 0;
+    std::string bestFontname;
+
+    // Bind by reference so each font name is not copied per iteration.
+    for (const auto &fontname : FontFactory::get().GetAllFontNames()) {
+        // At least the first word of the font name should match.
+        size_t minMatch = fontname.find(" ");
+        if (minMatch == std::string::npos) {
+            minMatch = fontname.length();
+        }
+
+        size_t Match = MatchingChars(family, fontname);
+        if (Match >= minMatch) {
+            // Matched length relative to the combined length of both names.
+            double relMatch = static_cast<float>(Match) / (fontname.length() + family.length());
+            if (relMatch > bestMatch) {
+                bestMatch = relMatch;
+                bestFontname = fontname;
+            }
+        }
+    }
+    return bestFontname.empty() ? "Arial" : bestFontname;
+}
+
+/**
+ * Build the specification string: the family, followed by "-<style>" when a
+ * style is set.
+ */
+std::string FontData::getSpecification() const
+{
+    std::string specification = family;
+    if (!style.empty()) {
+        specification += "-" + style;
+    }
+    return specification;
+}
+
+//------------------------------------------------------------------------
+// scanFonts from FontInfo.cc
+//------------------------------------------------------------------------
+
+/**
+ * Collect the fonts used by a resource dictionary and, recursively, by the
+ * resource dictionaries of the XObject and Pattern streams it refers to.
+ *
+ * @param pdf_doc        Document whose cross reference table resolves objects.
+ * @param resources      Resource dictionary to scan; null is ignored.
+ * @param fontsList      Receives one FontData per font; `page` is added to
+ *                       the `pages` set of every font found.
+ * @param visitedObjects Object numbers already scanned, used to stop cycles
+ *                       and repeated work.
+ * @param page           Page number recorded for the fonts found.
+ */
+void _getFontsRecursive(std::shared_ptr<PDFDoc> pdf_doc, Dict *resources, const FontList &fontsList,
+                        std::set<int> &visitedObjects, int page)
+{
+    if (!resources) {
+        return;
+    }
+
+    auto xref = pdf_doc->getXRef();
+
+    // Owned locally so the dictionary is released on return (it used to be
+    // allocated with new and never deleted).
+    // NOTE(review): assumes the fonts stay valid after the dictionary is
+    // destroyed because fontsList keeps its own FontPtr copies - confirm for
+    // builds where FontPtr is a raw pointer.
+    std::unique_ptr<InkFontDict> fontDict;
+    const Object &obj1 = resources->lookupNF("Font");
+    if (obj1.isRef()) {
+        Object obj2 = obj1.fetch(xref);
+        if (obj2.isDict()) {
+            auto r = obj1.getRef();
+            fontDict = std::make_unique<InkFontDict>(xref, &r, obj2.getDict());
+        }
+    } else if (obj1.isDict()) {
+        fontDict = std::make_unique<InkFontDict>(xref, nullptr, obj1.getDict());
+    }
+
+    if (fontDict) {
+        for (int i = 0; i < fontDict->getNumFonts(); ++i) {
+            auto font = fontDict->getFont(i);
+            if (!font) {
+                // Unusable entries are stored as null by InkFontDict and
+                // FontData dereferences its font, so skip them.
+                continue;
+            }
+            auto entry = fontsList->find(font);
+            if (entry == fontsList->end()) {
+                // Create new font data
+                entry = fontsList->emplace(font, FontData(font)).first;
+            }
+            entry->second.pages.insert(page);
+        }
+    }
+
+    // recursively scan any resource dictionaries in objects in this resource dictionary
+    const char *resTypes[] = {"XObject", "Pattern"};
+    for (const char *resType : resTypes) {
+        Object objDict = resources->lookup(resType);
+        if (!objDict.isDict())
+            continue;
+
+        for (int i = 0; i < objDict.dictGetLength(); ++i) {
+            Ref obj2Ref;
+            const Object obj2 = objDict.getDict()->getVal(i, &obj2Ref);
+            // Skip indirect objects that were already visited.
+            if (obj2Ref != Ref::INVALID() && !visitedObjects.insert(obj2Ref.num).second)
+                continue;
+
+            if (!obj2.isStream())
+                continue;
+
+            Ref resourcesRef;
+            const Object resObj = obj2.streamGetDict()->lookup("Resources", &resourcesRef);
+            if (resourcesRef != Ref::INVALID() && !visitedObjects.insert(resourcesRef.num).second)
+                continue;
+
+            // Do not recurse into the dictionary currently being scanned.
+            if (resObj.isDict() && resObj.getDict() != resources) {
+                _getFontsRecursive(pdf_doc, resObj.getDict(), fontsList, visitedObjects, page);
+            }
+        }
+    }
+}
+
+/**
+ * Build the list of fonts used in a PDF document.
+ *
+ * Every page's resource dictionary is scanned (including nested XObject and
+ * Pattern resources) and each font found is recorded together with the page
+ * numbers it was seen on. Pages that cannot be loaded or that have no
+ * resource dictionary are skipped.
+ */
+FontList getPdfFonts(std::shared_ptr<PDFDoc> pdf_doc)
+{
+    auto fontsList = std::make_shared<std::map<FontPtr, FontData>>();
+    auto count = pdf_doc->getCatalog()->getNumPages();
+    std::set<int> visitedObjects;
+
+    for (auto page_num = 1; page_num <= count; page_num++) {
+        auto page = pdf_doc->getCatalog()->getPage(page_num);
+        if (!page) {
+            continue;
+        }
+        auto resources = page->getResourceDict();
+        if (!resources) {
+            // Nothing to scan; the recursive scan dereferences its dictionary.
+            continue;
+        }
+        _getFontsRecursive(pdf_doc, resources, fontsList, visitedObjects, page_num);
+    }
+    return fontsList;
+}
+
+
+/**
+ * Read a string entry from a PDF dictionary as UTF-8.
+ *
+ * Strings starting with a UTF-16 byte order marker (big or little endian)
+ * are converted to UTF-8 with the marker removed; other strings are returned
+ * as stored.
+ *
+ * @return The string value, or an empty string when the key does not hold a
+ *         string or the UTF-16 conversion fails.
+ */
+std::string getDictString(Dict *dict, const char *key)
+{
+    Object obj = dict->lookup(key);
+
+    if (!obj.isString()) {
+        return "";
+    }
+
+    const GooString *value = obj.getString();
+
+    // g_convert returns a newly allocated buffer, or NULL on failure: copy the
+    // result into a std::string and release the buffer instead of leaking it,
+    // and never build a std::string from a null pointer.
+    auto utf16ToUtf8 = [value](const char *from_codeset) -> std::string {
+        // Skip the two-byte marker at the start of the string.
+        gchar *converted = g_convert(value->getCString() + 2, value->getLength() - 2, "UTF-8", from_codeset,
+                                     nullptr, nullptr, nullptr);
+        if (!converted) {
+            return {};
+        }
+        std::string result(converted);
+        g_free(converted);
+        return result;
+    };
+
+    if (value->hasUnicodeMarker()) {
+        return utf16ToUtf8("UTF-16BE");
+    }
+    if (value->hasUnicodeMarkerLE()) {
+        return utf16ToUtf8("UTF-16LE");
+    }
+    return value->toStr();
+}
+
+