Extracting images with node.js "Warning: Native JPEG decoding failed -- trying to recover: Image is not defined" #9603

svenyonson · 2018-03-25T14:56:42Z

Unable to get this to work. Looking at other issues here, I tried adding 'disableNativeImageDecoder', but to no avail.
Error for each page is: "Warning: Native JPEG decoding failed -- trying to recover: Image is not defined"

Note: If I cut and paste this code into an interactive Node.js session, it works, because of `var Image = require('canvas').Image;`. However, that line doesn't help when running the following code as a script.

var PDFJS = require('pdfjs-dist');
var Canvas = require('canvas');
var Image = require('canvas').Image;

/**
 * Scans every page of a PDF for JPEG image operators and, for each one found,
 * renders the page to a PNG file under `extract/`.
 *
 * The `extract/` directory is created when it does not exist. If `filename`
 * does not exist, a message is logged and nothing else happens.
 *
 * The PDF work runs inside a detached async function, so `proc` itself
 * returns immediately (with `undefined`); any failure during processing is
 * logged via `console.log` instead of being thrown to the caller.
 *
 * @param {string} filename - Path of the PDF file to process.
 */
function proc(filename) {
  const fs = require('fs');
  const { promisify } = require('util');
  // Promise-based write so that failures reach the try/catch below instead of
  // being thrown from inside a detached callback.
  const writeFile = promisify(fs.writeFile);
  const extractDir = 'extract/';

  if (!fs.existsSync(extractDir)) {
    fs.mkdirSync(extractDir);
  }
  if (!fs.existsSync(filename)) {
    console.log("File not found: " + filename);
    return;
  }

  (async () => {
    try {
      const doc = await PDFJS.getDocument({url: filename, disableNativeImageDecoder: true});
      const pages = doc.pdfInfo.numPages;

      for (let i = 1; i <= pages; i++) {
        const page = await doc.getPage(i);
        const ops = await page.getOperatorList();

        for (let j = 0; j < ops.fnArray.length; j++) {
          if (ops.fnArray[j] !== PDFJS.OPS.paintJpegXObject) {
            continue;
          }

          // The first operator argument is the key of the image object.
          const op = ops.argsArray[j][0];
          const img = page.objs.get(op);

          // Fix resolution: scale the page so its rendered width follows the
          // image width, and size the canvas to the image itself.
          const scale = img.width / page.pageInfo.view[2];
          const viewport = page.getViewport(scale);
          const canvas = new Canvas(img.width, img.height);
          const context = canvas.getContext('2d');

          await page.render({canvasContext: context, viewport: viewport});

          const imageData = canvas.toBuffer();
          // Zero-pad to at least three digits; longer page numbers are kept
          // whole rather than truncated.
          const ordinal = String(i).padStart(3, '0');
          // NOTE(review): the name depends only on the page number, so a page
          // with several matching images writes the same file repeatedly --
          // confirm whether that is intended.
          await writeFile(extractDir + 'image-' + ordinal + '.png', imageData);
        }
      }
      console.log('done');
    } catch (e) {
      console.log(e);
    }
  })();
}

proc("testfile.pdf");

The text was updated successfully, but these errors were encountered:

Snuffleupagus · 2018-03-25T15:20:53Z

I tried adding 'disableNativeImageDecoder', but to no avail.

Please note that that parameter was deprecated in PR #8350, and was subsequently removed in PR #8982.

For environments without native Image support, such as Node.js, you need to set the nativeImageDecoderSupport option (to 'none' in this case) in getDocument to avoid these kinds of errors; please refer to the latest API docs in

pdf.js/src/display/api.js

Lines 137 to 144 in 97faedb

    
 * @property {string} nativeImageDecoderSupport - (optional) Strategy for
 *   decoding certain (simple) JPEG images in the browser. This is useful for
 *   environments without DOM image and canvas support, such as e.g. Node.js.
 *   Valid values are 'decode', 'display' or 'none'; where 'decode' is intended
 *   for browsers with full image/canvas support, 'display' for environments
 *   with limited image support through stubs (useful for SVG conversion),
 *   and 'none' where JPEG images will be decoded entirely by PDF.js.
 *   The default value is 'decode'.

svenyonson · 2018-03-25T16:12:52Z

Thanks for the quick reply - that fixed the warning, but the extraction still fails:

I added nativeImageDecoderSupport: 'none' to getDocument
This causes the operator list to include OPS.paintImageXObject instead of OPS.paintJpegXObject
I changed the test to OPS.paintImageXObject
page.render now fails with "ReferenceError: HTMLElement is not defined"

svenyonson · 2018-04-01T14:06:01Z

Tim, not sure why this is closed. I was not trying to use font-face, so I don't think I would need to pass this switch, but I tried it anyway. While I no longer get "Image not defined", using the switch nativeImageDecoderSupport: 'none' causes the error "HTMLElement is not defined". This simple example fails:

var PDFJS = require('pdfjs-dist');
var Canvas = require('canvas');
const {promisify} = require('util');
const fs = require('fs');
const writeFile = promisify(fs.writeFile); 

/**
 * Scans every page of a PDF for image paint operators (JPEG or generic image
 * XObjects) and, for each one found, renders the page to a PNG file in the
 * current working directory.
 *
 * The work runs inside a detached async function, so `proc` returns
 * immediately (with `undefined`); any failure during processing is logged via
 * `console.log` instead of being thrown to the caller.
 *
 * @param {string} filename - Path of the PDF file to process.
 */
function proc(filename) {
  (async () => {
    try {
      const doc = await PDFJS.getDocument({url: filename, nativeImageDecoderSupport: 'none', disableFontFace: true});
      const pages = doc.pdfInfo.numPages;

      for (let i = 1; i <= pages; i++) {
        const page = await doc.getPage(i);
        const ops = await page.getOperatorList();

        for (let j = 0; j < ops.fnArray.length; j++) {
          const fn = ops.fnArray[j];
          if (fn !== PDFJS.OPS.paintJpegXObject && fn !== PDFJS.OPS.paintImageXObject) {
            continue;
          }

          // The first operator argument is the key of the image object.
          const op = ops.argsArray[j][0];
          const img = page.objs.get(op);

          // Scale the page so its rendered width follows the image width,
          // and size the canvas to the image itself.
          const scale = img.width / page.pageInfo.view[2];
          const viewport = page.getViewport(scale);
          const canvas = new Canvas(img.width, img.height);
          const context = canvas.getContext('2d');

          await page.render({canvasContext: context, viewport: viewport});

          const imageData = canvas.toBuffer();
          // Zero-pad to at least three digits; longer page numbers are kept
          // whole rather than truncated.
          const ordinal = String(i).padStart(3, '0');
          // NOTE(review): the name depends only on the page number, so a page
          // with several matching images writes the same file repeatedly --
          // confirm whether that is intended.
          await writeFile('image-' + ordinal + '.png', imageData);
        }
      }
      console.log('done');
    } catch (e) {
      console.log(e);
    }
  })();
}

proc("test.pdf");

svenyonson · 2018-04-01T15:03:03Z

Here is a small test file (4 pages)
test.pdf

timvandermeij · 2018-04-02T14:47:22Z

The HTMLElement is not defined error is fixed by #9588, so there should be no remaining issues here, which is why it was closed (automatically). If after applying #9588 and #9618 there are still problems, please open a new issue, but those two should fix everything that was reported.

shartoo · 2021-09-05T01:01:28Z

Hi there @svenyonson, there is a blog that successfully parses images from a PDF. Hope this helps:
https://codepen.io/Sphinxxxx/pen/MxwGQZ

timvandermeij added other node-specific labels Mar 25, 2018

timvandermeij mentioned this issue Apr 1, 2018

Improve the instructions and code for the pdf2png example #9618

Merged

timvandermeij closed this as completed in #9618 Apr 1, 2018

svenyonson mentioned this issue Apr 15, 2018

Reference Error: document is not defined when extracting images (ref: #9603) #9667

Closed

nokome mentioned this issue Jun 12, 2019

PDF: Implement decode to extract rPNG source code stencila/encoda#106

Closed

mozilla deleted a comment from CetinSert Sep 8, 2021

This comment has been minimized.

Sign in to view

CetinSert mentioned this issue Sep 8, 2021

🦊 PDF.js ❓ Image Extraction Question 9603 #3 pdf-ist/WebPDF#4

Closed

This comment has been minimized.

Sign in to view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Extracting images with node.js "Warning: Native JPEG decoding failed -- trying to recover: Image is not defined" #9603

Extracting images with node.js "Warning: Native JPEG decoding failed -- trying to recover: Image is not defined" #9603

svenyonson commented Mar 25, 2018 •

edited

Loading

Snuffleupagus commented Mar 25, 2018

svenyonson commented Mar 25, 2018

svenyonson commented Apr 1, 2018

svenyonson commented Apr 1, 2018

timvandermeij commented Apr 2, 2018 •

edited

Loading

shartoo commented Sep 5, 2021

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

Extracting images with node.js "Warning: Native JPEG decoding failed -- trying to recover: Image is not defined" #9603

Extracting images with node.js "Warning: Native JPEG decoding failed -- trying to recover: Image is not defined" #9603

Comments

svenyonson commented Mar 25, 2018 • edited Loading

Snuffleupagus commented Mar 25, 2018

svenyonson commented Mar 25, 2018

svenyonson commented Apr 1, 2018

svenyonson commented Apr 1, 2018

timvandermeij commented Apr 2, 2018 • edited Loading

shartoo commented Sep 5, 2021

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

svenyonson commented Mar 25, 2018 •

edited

Loading

timvandermeij commented Apr 2, 2018 •

edited

Loading