Skip to content

Commit

Permalink
feat: ability to add custom extractors via api (#484)
Browse files Browse the repository at this point in the history
* feat: ability to add custom extractors via api

* docs: updating readme

* fix: example.com was being used in another test

* fix: timezone was messing up date_published test

* fix: using a unique site for testing

* fix: updated custom extractor api

* docs: updating readme

* fix: removing unused fixture

* fix: updating test description

* feat: ability to add custom extractors via cli
  • Loading branch information
mtashley authored Sep 4, 2019
1 parent f95947f commit e12c916
Show file tree
Hide file tree
Showing 10 changed files with 3,186 additions and 3 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --ext

# Get the value of attributes by adding a pipe to --extend or --extend-list
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list links=".body a|href"

# Pass optional --add-extractor argument to add a custom extractor at runtime.
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --add-extractor ./src/extractors/fixtures/postlight.com/index.js
```

## License
Expand Down
31 changes: 28 additions & 3 deletions cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,25 @@ const {
l,
header,
h,
addExtractor,
x,
} = argv;
(async (urlToParse, contentType, extendedTypes, extendedListTypes, headers) => {
(async (
urlToParse,
contentType,
extendedTypes,
extendedListTypes,
headers,
addExtractor
) => {
if (!urlToParse) {
console.log(
'\n\
mercury-parser\n\n\
The Mercury Parser extracts semantic content from any url\n\n\
Usage:\n\
\n\
$ mercury-parser url-to-parse [--format=html|text|markdown] [--header.name=value]... [--extend type=selector]... [--extend-list type=selector]... \n\
$ mercury-parser url-to-parse [--format=html|text|markdown] [--header.name=value]... [--extend type=selector]... [--extend-list type=selector]... [--add-extractor path_to_extractor.js]... \n\
\n\
'
);
Expand All @@ -37,6 +46,7 @@ Usage:\n\
text: 'text',
txt: 'text',
};

const extensions = {};
[].concat(extendedTypes || []).forEach(t => {
const [name, selector] = t.split('=');
Expand All @@ -53,10 +63,18 @@ Usage:\n\
allowMultiple: true,
};
});

// Attempt to load custom extractor from path.
let customExtractor;
if (addExtractor) {
customExtractor = require(addExtractor);
}

const result = await Mercury.parse(urlToParse, {
contentType: contentTypeMap[contentType],
extend: extensions,
headers,
customExtractor,
});
console.log(JSON.stringify(result, null, 2));
} catch (e) {
Expand All @@ -75,4 +93,11 @@ Usage:\n\
console.error(`\n${reportBug}\n`);
process.exit(1);
}
})(url, format || f, extend || e, extendList || l, header || h);
})(
url,
format || f,
extend || e,
extendList || l,
header || h,
addExtractor || x
);
2,989 changes: 2,989 additions & 0 deletions fixtures/sandiegouniontribune.com/test.html

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions src/extractors/add-extractor.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import mergeSupportedDomains from '../utils/merge-supported-domains';

/**
 * Registry of extractors added at runtime through the API, keyed by domain.
 *
 * Created without a prototype so that a key lookup can only ever find an
 * extractor that was explicitly registered, never a member inherited from
 * Object.prototype (e.g. `constructor` or `toString`), and so that a
 * `__proto__` key is stored as ordinary data instead of replacing the
 * registry's prototype.
 */
export const apiExtractors = Object.create(null);

/**
 * Registers a custom extractor in the shared `apiExtractors` registry.
 *
 * @param {Object} extractor - Extractor definition; must have a truthy `domain`.
 * @returns {Object} The shared `apiExtractors` registry on success, or
 *   `{ error: true, message }` when `extractor` is missing or has no `domain`.
 */
export default function addExtractor(extractor) {
  if (!extractor || !extractor.domain) {
    return {
      error: true,
      message: 'Unable to add custom extractor. Invalid parameters.',
    };
  }

  // mergeSupportedDomains supplies the entries to register; presumably one
  // per domain the extractor handles -- verify against the helper.
  Object.assign(apiExtractors, mergeSupportedDomains(extractor));

  return apiExtractors;
}
22 changes: 22 additions & 0 deletions src/extractors/add-extractor.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import assert from 'assert';

import addExtractor from './add-extractor';

// Unit tests for addExtractor. Strict assertions are used so that a value
// which merely coerces to the expected one (e.g. 1 or '1' for `true`) fails.
describe('addExtractor(extractor)', () => {
  it('can add multiple custom extractors', () => {
    addExtractor({ domain: 'www.site1.com' });
    addExtractor({ domain: 'www.site2.com' });
    // Each call returns the shared registry, so the last result reflects all
    // three registrations.
    const registry = addExtractor({ domain: 'www.site3.com' });
    assert.strictEqual(Object.keys(registry).length, 3);
  });

  it('returns error if an extractor is not provided', () => {
    const outcome = addExtractor();
    assert.strictEqual(outcome.error, true);
  });

  it('returns error if a domain key is not included within the custom extractor', () => {
    const outcome = addExtractor({ test: 'abc' });
    assert.strictEqual(outcome.error, true);
  });
});
59 changes: 59 additions & 0 deletions src/extractors/custom/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -349,3 +349,62 @@ This script will open both an `html` and `json` file allowing you to preview you
If you've written a custom extractor, please send us a pull request! Passing tests that demonstrate your parser in action will help us evaluate the parser.
Sometimes you may find that the site you're parsing doesn't provide certain information. For example, some sites don't have deks, and in those instances, you don't need to write a selector for that field. If there's a test for a selector you don't need, you can just remove that test and make note of it in your pull request.
---
## Adding Custom Extractor via API
As of **version 2.1.1**, you can also add private custom extractors via the API. Your custom extractor must include a `domain`; if it is missing, `addExtractor` returns an error object and nothing is registered. Note that extractors added via the API take precedence over the packaged custom extractors.
```javascript
const customExtractor = {
domain: 'www.sandiegouniontribune.com',
title: {
selectors: ['h1', '.ArticlePage-headline'],
},
author: {
selectors: ['.ArticlePage-authorInfo-bio-name'],
},
content: {
selectors: ['article'],
},
};

Mercury.addExtractor(customExtractor);
```
---
## Passing custom extractor to addExtractor via CLI
It's also possible to add a custom extractor at runtime via the CLI.
### 1. Create your custom extractor in a standalone file.
```javascript
var customExtractor = {
domain: 'postlight.com',
title: {
selectors: ['h1'],
},
author: {
selectors: ['.byline-name'],
},
content: {
selectors: ['article'],
},
extend: {
uniqueKeyFromFixture: {
selectors: ['.single__hero-category'],
},
},
};

module.exports = customExtractor;
```
### 2. From the CLI, add the `--add-extractor` param:
```bash
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --add-extractor ./src/extractors/fixtures/postlight.com/index.js
```
19 changes: 19 additions & 0 deletions src/extractors/fixtures/postlight.com/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Fixture: a standalone custom extractor for postlight.com, exported as a
// CommonJS module so it can be loaded with `require`.

// Per-field selector lists.
const title = { selectors: ['h1'] };
const author = { selectors: ['.byline-name'] };
const content = { selectors: ['article'] };

// Extra field extracted in addition to the ones above.
const extend = {
  uniqueKeyFromFixture: { selectors: ['.single__hero-category'] },
};

module.exports = { domain: 'postlight.com', title, author, content, extend };
3 changes: 3 additions & 0 deletions src/extractors/get-extractor.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import URL from 'url';
import Extractors from './all';
import GenericExtractor from './generic';
import detectByHtml from './detect-by-html';
import { apiExtractors } from './add-extractor';

export default function getExtractor(url, parsedUrl, $) {
parsedUrl = parsedUrl || URL.parse(url);
Expand All @@ -13,6 +14,8 @@ export default function getExtractor(url, parsedUrl, $) {
.join('.');

return (
apiExtractors[hostname] ||
apiExtractors[baseDomain] ||
Extractors[hostname] ||
Extractors[baseDomain] ||
detectByHtml($) ||
Expand Down
11 changes: 11 additions & 0 deletions src/mercury.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import TurndownService from 'turndown';

import Resource from 'resource';
import { validateUrl } from 'utils';
import addCustomExtractor from 'extractors/add-extractor';
import getExtractor from 'extractors/get-extractor';
import RootExtractor, { selectExtendedTypes } from 'extractors/root-extractor';
import collectAllPages from 'extractors/collect-all-pages';
Expand All @@ -16,6 +17,7 @@ const Mercury = {
contentType = 'html',
headers = {},
extend,
customExtractor,
} = opts;

// if no url was passed and this is the browser version,
Expand Down Expand Up @@ -43,6 +45,11 @@ const Mercury = {
return $;
}

// Add custom extractor via cli.
if (customExtractor) {
addCustomExtractor(customExtractor);
}

const Extractor = getExtractor(url, parsedUrl, $);
// console.log(`Using extractor for ${Extractor.domain}`);

Expand Down Expand Up @@ -112,6 +119,10 @@ const Mercury = {
/**
 * Creates a resource for the given URL by delegating to `Resource.create`.
 *
 * @param {string} url - URL handed straight to `Resource.create`.
 * @returns Whatever `Resource.create(url)` returns, unchanged.
 */
fetchResource(url) {
return Resource.create(url);
},

/**
 * Public API entry point for registering a custom extractor.
 *
 * Forwards to `addCustomExtractor` (imported from 'extractors/add-extractor')
 * and returns its result unchanged.
 *
 * @param {Object} extractor - Custom extractor definition to register.
 */
addExtractor(extractor) {
return addCustomExtractor(extractor);
},
};

export default Mercury;
36 changes: 36 additions & 0 deletions src/mercury.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -182,4 +182,40 @@ describe('Mercury', () => {
assert.equal(sites.length, 8);
assert.equal(sites[1], 'http://nymag.com/daily/intelligencer/');
});

it('is able to use custom extractors (with extension) added via api', async () => {
  // Article URL and the saved copy of its HTML used instead of a live fetch.
  const articleUrl =
    'https://www.sandiegouniontribune.com/business/growth-development/story/2019-08-27/sdsu-mission-valley-stadium-management-firm';
  const fixturePath = './fixtures/sandiegouniontribune.com/test.html';
  const fixtureHtml = fs.readFileSync(fixturePath, 'utf8');

  // Register an extractor for the article's domain, including one extended
  // field, through the public API.
  Mercury.addExtractor({
    domain: 'www.sandiegouniontribune.com',
    title: { selectors: ['h1', '.ArticlePage-headline'] },
    author: { selectors: ['.ArticlePage-authorInfo-bio-name'] },
    content: { selectors: ['article'] },
    extend: {
      testContent: { selectors: ['.ArticlePage-breadcrumbs a'] },
    },
  });

  const parsed = await Mercury.parse(articleUrl, { html: fixtureHtml });

  // Fields expected from the custom extractor, checked in this order.
  const expectedFields = [
    ['author', 'Jennifer Van Grove'],
    ['domain', 'www.sandiegouniontribune.com'],
    ['total_pages', 1],
    ['testContent', 'Growth & Development'],
  ];

  assert.equal(typeof parsed, 'object');
  expectedFields.forEach(([field, expected]) => {
    assert.equal(parsed[field], expected);
  });
});
});

0 comments on commit e12c916

Please sign in to comment.