Skip to content

Commit

Permalink
feat: Preserve custom element tags
Browse files Browse the repository at this point in the history
  • Loading branch information
ras0q committed Jan 23, 2025
1 parent 118f015 commit cd3c160
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ The `options` object accepts a number of properties, all optional:
* `serializer` (function, default `el => el.innerHTML`) controls how the `content` property returned by the `parse()` method is produced from the root DOM element. It may be useful to specify the `serializer` as the identity function (`el => el`) to obtain a DOM element instead of a string for `content` if you plan to process it further.
* `allowedVideoRegex` (RegExp, default `undefined` ): a regular expression that matches video URLs that should be allowed to be included in the article content. If `undefined`, the [default regex](https://github.com/mozilla/readability/blob/8e8ec27cd2013940bc6f3cc609de10e35a1d9d86/Readability.js#L133) is applied.
* `linkDensityModifier` (number, default `0`): a number that is added to the base link density threshold during the shadiness checks. This can be used to penalize nodes with a high link density or vice versa.
* `tagsToPreserve` (array of strings, default `[]`): additional tag names (for example, custom elements) to preserve, in addition to the default set.

### `parse()`

Expand Down
7 changes: 6 additions & 1 deletion Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ function Readability(doc, options) {
this._disableJSONLD = !!options.disableJSONLD;
this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos;
this._linkDensityModifier = options.linkDensityModifier || 0;
this._tagsToPreserve = options.tagsToPreserve || [];

// Start with all flags set
this._flags =
Expand Down Expand Up @@ -1156,7 +1157,11 @@ Readability.prototype = {
continue;
}

if (this.DEFAULT_TAGS_TO_SCORE.includes(node.tagName)) {
const tagsToScore = [
...this.DEFAULT_TAGS_TO_SCORE,
...this._tagsToPreserve,
]
if (tagsToScore.includes(node.tagName)) {
elementsToScore.push(node);
}

Expand Down
1 change: 1 addition & 0 deletions index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ export class Readability<T = string> {
serializer?: (node: Node) => T;
disableJSONLD?: boolean;
allowedVideoRegex?: RegExp;
tagsToPreserve?: string[];
}
);

Expand Down
24 changes: 24 additions & 0 deletions test/test-readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,14 @@ describe("Readability API", function () {
new Readability(doc, { allowedVideoRegex })._allowedVideoRegex
).eql(allowedVideoRegex);
});

// Verifies how the constructor stores the `tagsToPreserve` option on the
// instance as `_tagsToPreserve`.
it("should accept a tagsToPreserve option", function () {
// With no options given, the field falls back to an empty array.
expect(new Readability(doc)._tagsToPreserve).eql([]);
// When supplied, the caller's array contents are kept as-is.
expect(
new Readability(doc, { tagsToPreserve: ["my-custom-tag"] })
._tagsToPreserve
).eql(["my-custom-tag"]);
});
});

describe("#parse", function () {
Expand Down Expand Up @@ -356,6 +364,22 @@ describe("Readability API", function () {
}).parse().content;
expect(content).eql(expected_xhtml);
});

// Parses a small document made of a paragraph followed by a <my-custom-tag>
// element wrapping another paragraph, and expects the serialized content to
// keep the custom element around its paragraph.
// NOTE(review): Readability.js compares the option values against
// `node.tagName`, which is usually upper-case for elements in HTML documents.
// Confirm that the lower-case "my-custom-tag" actually matches, and that this
// test fails when the option is omitted.
it("should use custom tags to preserve sent as option", function () {
// Input: a plain paragraph plus a custom element containing a paragraph.
var dom = new JSDOM(
"<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc mollis leo lacus, vitae semper nisl ullamcorper ut.</p>" +
"<my-custom-tag><p>My Custom Tag</p></my-custom-tag>"
);
// Expected output: both pieces wrapped in the readability page container,
// with the custom element still present.
var expected_xhtml =
'<div id="readability-page-1" class="page">' +
"<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc mollis leo lacus, vitae semper nisl ullamcorper ut.</p>" +
"<my-custom-tag><p>My Custom Tag</p></my-custom-tag>" +
"</div>";
// Run the parser with the custom tag name passed through the new option.
var content = new Readability(dom.window.document, {
tagsToPreserve: ["my-custom-tag"],
}).parse().content;
expect(content).eql(expected_xhtml);
});
});
});

Expand Down

0 comments on commit cd3c160

Please sign in to comment.