diff --git a/fixtures/www.cnet.com/1482428196806.html b/fixtures/www.cnet.com/1482428196806.html new file mode 100644 index 000000000..99223e1fe --- /dev/null +++ b/fixtures/www.cnet.com/1482428196806.html @@ -0,0 +1,113 @@ + Seven mobile trends to look for in 2017 - CNET
HolidayBuyer's Guide
Close
Drag
\ No newline at end of file diff --git a/src/extractors/custom/index.js b/src/extractors/custom/index.js index 61bb872ce..48117f87e 100644 --- a/src/extractors/custom/index.js +++ b/src/extractors/custom/index.js @@ -54,6 +54,7 @@ export * from './uproxx.com'; export * from './www.eonline.com'; export * from './www.miamiherald.com'; export * from './www.refinery29.com'; +export * from './www.cnet.com'; export * from './www.cinemablend.com'; export * from './www.today.com'; export * from './www.howtogeek.com'; diff --git a/src/extractors/custom/www.cnet.com/index.js b/src/extractors/custom/www.cnet.com/index.js new file mode 100644 index 000000000..2b2fd6bdb --- /dev/null +++ b/src/extractors/custom/www.cnet.com/index.js @@ -0,0 +1,61 @@ +export const WwwCnetComExtractor = { + domain: 'www.cnet.com', + + title: { + selectors: [ + ['meta[name="og:title"]', 'value'], + ], + }, + + author: { + selectors: [ + 'a.author', + ], + }, + + date_published: { + selectors: [ + 'time', + ], + + timezone: 'America/Los_Angeles', + }, + + dek: { + selectors: [ + '.article-dek', + ], + }, + + lead_image_url: { + selectors: [ + ['meta[name="og:image"]', 'value'], + ], + }, + + content: { + selectors: [ + ['img.__image-lead__', '.article-main-body'], + '.article-main-body', + ], + + // For each figure.image: set its img to 100% width/height, add the + // __image-lead__ class that the first content selector targets, and prepend the img to the figure. + transforms: { + 'figure.image': ($node) => { + const $img = $node.find('img'); + $img.attr('width', '100%'); + $img.attr('height', '100%'); + $img.addClass('__image-lead__'); + $node.remove('.imgContainer').prepend($img); // NOTE(review): in cheerio/jQuery, remove(selector) filters the matched set itself rather than searching descendants, so this may never remove a nested .imgContainer - confirm intent + }, + }, + + // Is there anything that is in the result that shouldn't be? 
+ // The clean selectors will remove anything that matches from + // the result + clean: [ + + ], + }, +}; diff --git a/src/extractors/custom/www.cnet.com/index.test.js b/src/extractors/custom/www.cnet.com/index.test.js new file mode 100644 index 000000000..eed1eeb32 --- /dev/null +++ b/src/extractors/custom/www.cnet.com/index.test.js @@ -0,0 +1,97 @@ +import assert from 'assert'; +import fs from 'fs'; +import URL from 'url'; +import cheerio from 'cheerio'; + +import Mercury from 'mercury'; +import getExtractor from 'extractors/get-extractor'; +import { excerptContent } from 'utils/text'; + +describe('WwwCnetComExtractor', () => { + describe('initial test case', () => { + let result; + let url; + beforeAll(() => { + url = + 'https://www.cnet.com/news/seven-mobile-trends-to-look-for-in-2017/'; + const html = + fs.readFileSync('./fixtures/www.cnet.com/1482428196806.html'); + result = + Mercury.parse(url, html, { fallback: false }); // not awaited here; every test below awaits result itself + }); + + it('is selected properly', () => { + // This test should be passing by default. + // It sanity checks that the correct parser + // is being selected for URLs from this domain + const extractor = getExtractor(url); + assert.equal(extractor.domain, URL.parse(url).hostname); + }); + + it('returns the title', async () => { + // To pass this test, fill out the title selector + // in ./src/extractors/custom/www.cnet.com/index.js. + const { title } = await result; + + // Update these values with the expected values from + // the article. + assert.equal(title, 'Seven mobile trends to look for in 2017'); + }); + + it('returns the author', async () => { + // To pass this test, fill out the author selector + // in ./src/extractors/custom/www.cnet.com/index.js. + const { author } = await result; + + // Update these values with the expected values from + // the article. 
+ assert.equal(author, 'Marguerite Reardon'); + }); + + it('returns the date_published', async () => { + // To pass this test, fill out the date_published selector + // in ./src/extractors/custom/www.cnet.com/index.js. + const { date_published } = await result; + + // Update these values with the expected values from + // the article. + assert.equal(date_published, '2016-12-22T13:00:00.000Z'); // expected value is UTC (Z suffix); the extractor sets timezone: 'America/Los_Angeles' + }); + + it('returns the dek', async () => { + // To pass this test, fill out the dek selector + // in ./src/extractors/custom/www.cnet.com/index.js. + const { dek } = await result; + + // Update these values with the expected values from + // the article. + assert.equal(dek, 'An anti-regulation, pro-dealmaking US president could make for an interesting year for wireless and broadband companies.'); + }); + + it('returns the lead_image_url', async () => { + // To pass this test, fill out the lead_image_url selector + // in ./src/extractors/custom/www.cnet.com/index.js. + const { lead_image_url } = await result; + + // Update these values with the expected values from + // the article. + assert.equal(lead_image_url, 'https://cnet2.cbsistatic.com/img/eW0A_hGjyVcT0oRB_23SDCOdEGU=/670x503/2016/12/16/48966e5f-87df-4915-ad68-ed70cdd37fdd/gettyimages-537252007.jpg'); + }); + + it('returns the content', async () => { + // To pass this test, fill out the content selector + // in ./src/extractors/custom/www.cnet.com/index.js. + // You may also want to make use of the clean and transform + // options. + const { content } = await result; + + const $ = cheerio.load(content || ''); + + const first13 = excerptContent($('*').first().text(), 13); // presumably keeps the first 13 words (excerptContent comes from utils/text) - verify + + // Update these values with the expected values from + // the article. + assert.equal(first13, 'Faster networks, more "free data" and a dismantling of net neutrality rules could'); + }); + }); +});