Fix value for font-family in html by removing the subset tag from the PDF font-name (#357)

* Fix font name by removing subset tag
* Added line to CHANGELOG.md
* Add documentation and clear variable name
* Use `html.escape()` to encode strings for html and always return `str` instead of `bytes`
pdfminer · Jan 16, 2020 · 410d7ec · 410d7ec
1 parent fff3ac2
commit 410d7ec
Show file tree

Hide file tree

Showing 4 changed files with 15 additions and 13 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 - Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
+- Fix font name in html output such that it is recognized by browser ([#357](https://github.com/pdfminer/pdfminer.six/pull/357))
 - Compute correct font height by removing scaling with font bounding box height ([#348](https://github.com/pdfminer/pdfminer.six/pull/348))
 - KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))
 

diff --git a/pdfminer/converter.py b/pdfminer/converter.py
@@ -292,7 +292,7 @@ def write_footer(self):
         return
 
     def write_text(self, text):
-        self.write(enc(text, None))
+        self.write(enc(text))
         return
 
     def place_rect(self, color, borderwidth, x, y, w, h):
@@ -317,7 +317,7 @@ def place_image(self, item, borderwidth, x, y, w, h):
             name = self.imagewriter.export_image(item)
             s = '<img src="%s" border="%d" style="position:absolute; ' \
                 'left:%dpx; top:%dpx;" width="%d" height="%d" />\n' % \
-                (enc(name, None), borderwidth, x * self.scale,
+                (enc(name), borderwidth, x * self.scale,
                  (self._yoffset - y) * self.scale, w * self.scale,
                  h * self.scale)
             self.write(s)
@@ -358,8 +358,11 @@ def put_text(self, text, fontname, fontsize):
         if font != self._font:
             if self._font is not None:
                 self.write('</span>')
+            # Remove subset tag from fontname, see PDF Reference 5.5.3
+            fontname_without_subset_tag = fontname.split('+')[-1]
             self.write('<span style="font-family: %s; font-size:%dpx">' %
-                       (enc(fontname), fontsize * self.scale * self.fontscale))
+                       (fontname_without_subset_tag,
+                        fontsize * self.scale * self.fontscale))
             self._font = font
         self.write_text(text)
         return
@@ -479,7 +482,7 @@ def write_footer(self):
     def write_text(self, text):
         if self.stripcontrol:
             text = self.CONTROL.sub('', text)
-        self.write(enc(text, None))
+        self.write(enc(text))
         return
 
     def receive_layout(self, ltpage):
@@ -544,7 +547,7 @@ def render(item):
             elif isinstance(item, LTChar):
                 s = '<text font="%s" bbox="%s" colourspace="%s" ' \
                     'ncolour="%s" size="%.3f">' % \
-                    (enc(item.fontname, None), bbox2str(item.bbox),
+                    (enc(item.fontname), bbox2str(item.bbox),
                      item.ncs.name, item.graphicstate.ncolor, item.size)
                 self.write(s)
                 self.write_text(item.get_text())
@@ -555,7 +558,7 @@ def render(item):
                 if self.imagewriter is not None:
                     name = self.imagewriter.export_image(item)
                     self.write('<image src="%s" width="%d" height="%d" />\n' %
-                               (enc(name, None), item.width, item.height))
+                               (enc(name), item.width, item.height))
                 else:
                     self.write('<image width="%d" height="%d" />\n' %
                                (item.width, item.height))

diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py
@@ -156,7 +156,7 @@ def render_string(self, textstate, seq, ncs, graphicstate):
                 except PDFUnicodeNotDefined:
                     print(chars)
                     pass
-        self.outfp.write(utils.enc(text, self.codec))
+        self.outfp.write(utils.enc(text))
         return
 
     def begin_page(self, page, ctm):

diff --git a/pdfminer/utils.py b/pdfminer/utils.py
@@ -2,6 +2,8 @@
 Miscellaneous Routines.
 """
 import struct
+from html import escape
+
 import chardet  # For str encoding detection
 
 # from sys import maxint as INF doesn't work anymore under Python3, but PDF
@@ -250,15 +252,11 @@ def decode_text(s):
         return ''.join(PDFDocEncoding[c] for c in s)
 
 
-def enc(x, codec='ascii'):
+def enc(x):
     """Encodes a string for SGML/XML/HTML"""
     if isinstance(x, bytes):
         return ''
-    x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;') \
-        .replace('"', '&quot;')
-    if codec:
-        x = x.encode(codec, 'xmlcharrefreplace')
-    return x
+    return escape(x)
 
 
 def bbox2str(bbox):