Skip to content

Commit

Permalink
Fix value for font-family in html by removing the subset tag from the…
Browse files Browse the repository at this point in the history
… PDF font-name (#357)

* Fix font name by removing subset tag

* Added line to CHANGELOG.md

* Add documentation and a clearer variable name

* Use `html.escape()` to encode strings for html and always return `str` instead of `bytes`
  • Loading branch information
pietermarsman authored Jan 16, 2020
1 parent fff3ac2 commit 410d7ec
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 13 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

### Fixed
- Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
- Fix font name in HTML output so that it is recognized by browsers ([#357](https://github.com/pdfminer/pdfminer.six/pull/357))
- Compute correct font height by removing scaling with font bounding box height ([#348](https://github.com/pdfminer/pdfminer.six/pull/348))
- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))

Expand Down
15 changes: 9 additions & 6 deletions pdfminer/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def write_footer(self):
return

def write_text(self, text):
self.write(enc(text, None))
self.write(enc(text))
return

def place_rect(self, color, borderwidth, x, y, w, h):
Expand All @@ -317,7 +317,7 @@ def place_image(self, item, borderwidth, x, y, w, h):
name = self.imagewriter.export_image(item)
s = '<img src="%s" border="%d" style="position:absolute; ' \
'left:%dpx; top:%dpx;" width="%d" height="%d" />\n' % \
(enc(name, None), borderwidth, x * self.scale,
(enc(name), borderwidth, x * self.scale,
(self._yoffset - y) * self.scale, w * self.scale,
h * self.scale)
self.write(s)
Expand Down Expand Up @@ -358,8 +358,11 @@ def put_text(self, text, fontname, fontsize):
if font != self._font:
if self._font is not None:
self.write('</span>')
# Remove subset tag from fontname, see PDF Reference 5.5.3
fontname_without_subset_tag = fontname.split('+')[-1]
self.write('<span style="font-family: %s; font-size:%dpx">' %
(enc(fontname), fontsize * self.scale * self.fontscale))
(fontname_without_subset_tag,
fontsize * self.scale * self.fontscale))
self._font = font
self.write_text(text)
return
Expand Down Expand Up @@ -479,7 +482,7 @@ def write_footer(self):
def write_text(self, text):
if self.stripcontrol:
text = self.CONTROL.sub('', text)
self.write(enc(text, None))
self.write(enc(text))
return

def receive_layout(self, ltpage):
Expand Down Expand Up @@ -544,7 +547,7 @@ def render(item):
elif isinstance(item, LTChar):
s = '<text font="%s" bbox="%s" colourspace="%s" ' \
'ncolour="%s" size="%.3f">' % \
(enc(item.fontname, None), bbox2str(item.bbox),
(enc(item.fontname), bbox2str(item.bbox),
item.ncs.name, item.graphicstate.ncolor, item.size)
self.write(s)
self.write_text(item.get_text())
Expand All @@ -555,7 +558,7 @@ def render(item):
if self.imagewriter is not None:
name = self.imagewriter.export_image(item)
self.write('<image src="%s" width="%d" height="%d" />\n' %
(enc(name, None), item.width, item.height))
(enc(name), item.width, item.height))
else:
self.write('<image width="%d" height="%d" />\n' %
(item.width, item.height))
Expand Down
2 changes: 1 addition & 1 deletion pdfminer/pdfdevice.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def render_string(self, textstate, seq, ncs, graphicstate):
except PDFUnicodeNotDefined:
print(chars)
pass
self.outfp.write(utils.enc(text, self.codec))
self.outfp.write(utils.enc(text))
return

def begin_page(self, page, ctm):
Expand Down
10 changes: 4 additions & 6 deletions pdfminer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
Miscellaneous Routines.
"""
import struct
from html import escape

import chardet # For str encoding detection

# from sys import maxint as INF doesn't work anymore under Python3, but PDF
Expand Down Expand Up @@ -250,15 +252,11 @@ def decode_text(s):
return ''.join(PDFDocEncoding[c] for c in s)


def enc(x, codec='ascii'):
def enc(x):
"""Encodes a string for SGML/XML/HTML"""
if isinstance(x, bytes):
return ''
x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;') \
.replace('"', '&quot;')
if codec:
x = x.encode(codec, 'xmlcharrefreplace')
return x
return escape(x)


def bbox2str(bbox):
Expand Down

0 comments on commit 410d7ec

Please sign in to comment.