Skip to content

Commit

Permalink
Merge pull request #16 from kyle-seongwoo-jun/fix/2024-renewed
Browse files Browse the repository at this point in the history
Update website with 2024 renewal and refactor code
  • Loading branch information
kyle-seongwoo-jun authored Mar 13, 2024
2 parents 59d338c + de72cf0 commit ed080c7
Showing 1 changed file with 97 additions and 92 deletions.
189 changes: 97 additions & 92 deletions scripts/scrape-from-apple.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,118 +9,123 @@ interface DeviceDictionary {
[id: string]: string | string[]
}

// Apple support pages to scrape, keyed by Mac product line.
// Only the URLs (the object's values) are passed to loadDevicesFrom below.
const websites = {
'MacBook': 'https://support.apple.com/en-us/HT201608',
'MacBook Air': 'https://support.apple.com/en-us/HT201862',
'MacBook Pro': 'https://support.apple.com/en-us/HT201300',
'iMac': 'https://support.apple.com/en-us/HT201634',
'Mac mini': 'https://support.apple.com/en-us/HT201894',
'Mac Pro': 'https://support.apple.com/en-us/HT202888',
}

async function loadDevicesFrom(url: string): Promise<Device[]> {
// request HTML from URL
const html = await fetch(url).then(res => res.text())
const document = new DOMParser().parseFromString(html, 'text/html')

if (!document) {
console.error('[ERROR] failed to parse HTML from', url)
return []
}

const header = document.querySelector('.gb-header')?.innerText
const isNewPage = header !== undefined
console.log(`[INFO] parsing ${url} (${isNewPage ? `new, ${header}` : 'old'})`)

try {
const devices = isNewPage ? parseNewPage(document) : parseOldPage(document)
console.log(`[INFO] parsed ${devices.length} devices from ${url}`)
return devices
} catch (e) {
console.error('[ERROR] failed to parse HTML from', url, 'error:', e)
return []
class AppleWebsiteParser {
private MODEL_IDENTIFIER = 'Model Identifier: '
private collator = new Intl.Collator(undefined, { numeric: true, sensitivity: 'base' })

/**
 * Loads the devices listed on one page, or on several pages at once.
 *
 * When given an array, every URL is requested concurrently and the per-page
 * results are joined into a single list, in the same order as the URLs.
 *
 * @param urlOrUrls a single page URL, or a list of page URLs
 * @returns every device found on the requested page(s)
 */
public async loadDevicesFrom(url: string): Promise<Device[]>
public async loadDevicesFrom(urls: string[]): Promise<Device[]>
public async loadDevicesFrom(urlOrUrls: string | string[]): Promise<Device[]> {
  // single URL: hand it straight to the per-page loader
  if (typeof urlOrUrls === 'string') {
    return this._loadDevicesFrom(urlOrUrls)
  }

  // several URLs: start all requests, then flatten the per-page lists one level
  const pendingPages = urlOrUrls.map(pageUrl => this._loadDevicesFrom(pageUrl))
  const devicesPerPage = await Promise.all(pendingPages)
  return devicesPerPage.flat()
}
}

function parseOldPage(document: HTMLDocument): Device[] {
const div = document.querySelector('#sections')
if (!div) {
throw new Error('failed to find #sections element')
/**
 * Downloads one page and extracts the devices listed on it.
 *
 * An HTTP error status, an unparsable document, or a page whose structure
 * cannot be parsed is logged and reported as an empty list. A rejected
 * `fetch` (e.g. a network failure) is not caught here and propagates to the
 * caller.
 *
 * @param url address of the page to download and parse
 * @returns the devices parsed from the page, or `[]` when the page could not be used
 */
private async _loadDevicesFrom(url: string): Promise<Device[]> {
  // request HTML from URL
  const response = await fetch(url)
  if (!response.ok) {
    // without this check an error page would be parsed as if it were the device list
    console.error('[ERROR] failed to fetch', url, 'status:', response.status)
    // release the unread body so the connection is not left dangling
    await response.body?.cancel()
    return []
  }

  const html = await response.text()
  const document = new DOMParser().parseFromString(html, 'text/html')

  if (!document) {
    console.error('[ERROR] failed to parse HTML from', url)
    return []
  }

  // the title is only used for the log line; it is undefined when no .gb-header exists
  const pageTitle = document.querySelector('.gb-header')?.innerText
  console.log(`[INFO] parsing ${url} (${pageTitle})`)

  try {
    const devices = this.parseDevicesFrom(document)
    console.log(`[INFO] parsed ${devices.length} devices from ${url}`)
    return devices
  } catch (e) {
    // parseDevicesFrom throws when the page does not have the expected structure
    console.error('[ERROR] failed to parse HTML from', url, 'error:', e)
    return []
  }
}

// parse with regexp
const html = div.innerHTML.replaceAll('&nbsp;', ' ')
const matches = [...html.matchAll(/<strong>([^<]+?)(<br>\n)? ?<\/strong>(.+\n)?(.+\n)?(.+\n)?Model Identifier: (.+?) ?<br>/g)]

const devices: Device[] = []
matches.forEach(group => {
const ids = group[6].replace(';', ',').split(', ')
const name = group[1]
ids.forEach(id => devices.push({ id, name }))
})

return devices
}

function parseNewPage(document: HTMLDocument): Device[] {
const MODEL_IDENTIFIER = 'Model Identifier: '
private parseDevicesFrom(document: HTMLDocument): Device[] {
const names = this.parseNamesFrom(document)
const ids = this.parseIdsFrom(document)
if (names.length !== ids.length) {
throw new Error('names and ids are not matched')
}

const names = [...document.querySelectorAll('p.gb-paragraph b')].map(b => (b as Element).innerText.trim())
const ids = [...document.querySelectorAll('p.gb-paragraph')].filter(p => (p as Element).innerText.startsWith(MODEL_IDENTIFIER)).map(p => (p as Element).innerText.replace(MODEL_IDENTIFIER, ''))
const devices = names.map((name, i) => {
const id = ids[i]
return id.split(/; |, /).map(id => ({ id, name }))
}).flat()

if (names.length !== ids.length) {
throw new Error('names and ids are not matched')
return devices
}

const devices = names.map((name, i) => {
let id = ids[i]
private parseNamesFrom(document: HTMLDocument): string[] {
const names = this.parseTextsFrom(document, 'p.gb-paragraph b')

// Apple might have temporarily miswritten the document, as currently, on https://support.apple.com/en-us/102852,
// there is no line break between Model Identifier and Part Numbers in the description of the Mac mini (2023) model.
// Therefore, a separate handling for this model has been added
// (planned to be removed when the document is updated).
if (id.includes('Part Numbers:')) {
id = id.split('Part Numbers:')[0].trim()
// if any of these fields ends with a colon, the page uses the renewed 2024 layout,
// so the names have to be parsed in the new way
const is2024Renewed = names.some(name => name.endsWith(':'))
if (is2024Renewed) {
const names = this.parseTextsFrom(document, 'h2.gb-header')
return names
}

// some devices have multiple identifiers
return id.split('; ').map(id => ({ id, name }))
}).flat()
return names
}

return devices
}
private parseIdsFrom(document: HTMLDocument): string[] {
const ids = this.parseTextsFrom(document, 'p.gb-paragraph')
.filter(text => text.startsWith(this.MODEL_IDENTIFIER))
.map(text => text.replace(this.MODEL_IDENTIFIER, ''))

function toDict(devices: Device[]) {
const dict: DeviceDictionary = {}
return ids
}

// natural sort by id
const collator = new Intl.Collator(undefined, { numeric: true, sensitivity: 'base' })
devices.sort((a, b) => collator.compare(a.id, b.id))
/**
 * Collects the trimmed text of every element matching a CSS selector.
 *
 * @param document parsed page to search
 * @param selector CSS selector of the elements to read
 * @returns the `innerText` of each match, trimmed, in the order `querySelectorAll` returns them
 */
private parseTextsFrom(document: HTMLDocument, selector: string): string[] {
  const texts: string[] = []
  for (const node of document.querySelectorAll(selector)) {
    const element = node as Element
    texts.push(element.innerText.trim())
  }
  return texts
}

// array to object
devices.forEach(device => {
if (!Object.keys(dict).includes(device.id)) {
dict[device.id] = device.name
} else if (typeof dict[device.id] === "string") {
dict[device.id] = [dict[device.id] as string, device.name].sort()
} else {
const array = dict[device.id] as string[]
array.push(device.name)
array.sort()
}
})
/**
 * Groups devices by identifier into a dictionary.
 *
 * Keys are inserted in natural order of the identifier (as compared by
 * `this.collator`). An identifier that occurs once maps to its name; an
 * identifier that occurs several times maps to a sorted array of all its names.
 *
 * The input array is not modified.
 *
 * @param devices devices to group
 * @returns dictionary from device id to a name, or to a sorted list of names
 */
public toDict(devices: Device[]): DeviceDictionary {
  const dict: DeviceDictionary = {}

  // natural sort by id, on a copy so the caller's array keeps its order
  const sorted = [...devices].sort((a, b) => this.collator.compare(a.id, b.id))

  // array to object
  for (const { id, name } of sorted) {
    // own-property lookup: constant time per device, and inherited keys are ignored
    const existing = Object.prototype.hasOwnProperty.call(dict, id) ? dict[id] : undefined

    if (existing === undefined) {
      dict[id] = name
    } else if (typeof existing === 'string') {
      // second name for this id: switch from a single string to a sorted list
      dict[id] = [existing, name].sort()
    } else {
      existing.push(name)
      existing.sort()
    }
  }

  return dict
}
}

return dict
// Apple support pages to scrape, keyed by Mac product line.
// Only the URLs (the object's values) are handed to the parser below.
const MAC_WEBSITES = {
'MacBook': 'https://support.apple.com/en-us/HT201608',
'MacBook Air': 'https://support.apple.com/en-us/HT201862',
'MacBook Pro': 'https://support.apple.com/en-us/HT201300',
'iMac': 'https://support.apple.com/en-us/HT201634',
'Mac mini': 'https://support.apple.com/en-us/HT201894',
'Mac Pro': 'https://support.apple.com/en-us/HT202888',
}

console.log('generating...')

const devices = ([] as Device[]).concat(
...await Promise.all(
Object.values(websites).map(url => loadDevicesFrom(url))
)
)
const dict = toDict(devices)
const parser = new AppleWebsiteParser()
const devices = await parser.loadDevicesFrom(Object.values(MAC_WEBSITES))
const dict = parser.toDict(devices)
const json = JSON.stringify(dict, null, 2)
await Deno.writeTextFile('mac-device-identifiers.json', json)

Expand Down

0 comments on commit ed080c7

Please sign in to comment.