From d81f21ee6460ec3a9b917e91a2e4034026ee4895 Mon Sep 17 00:00:00 2001 From: Kfir Dayan Date: Tue, 18 Apr 2023 10:51:46 +0300 Subject: [PATCH] wip --- src/crawler/crawler.service.ts | 45 +++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/src/crawler/crawler.service.ts b/src/crawler/crawler.service.ts index 196e550..12a58f2 100644 --- a/src/crawler/crawler.service.ts +++ b/src/crawler/crawler.service.ts @@ -12,16 +12,55 @@ export class CrawlerService { const page = await browser.newPage(); await page.goto(url); const directory = `crawler_assets/${this.extractDomain(url)}/`; - console.log(directory) if (!fs.existsSync(directory)) { mkdirSync(directory); } - await page.pdf({path: `${directory}/page.pdf`, format: 'A4'}); + await page.pdf({path: `${directory}/page.pdf`}); + + // extract stylesheets + // save all stylesheets to disk + + const stylesheetsUrls = await page.$$eval('link[rel="stylesheet"]', links => links.map(link => link.href)); + console.log(stylesheetsUrls) + stylesheetsUrls.forEach(async (stylesheetUrl) => { + if(!stylesheetUrl.startsWith('http')) return; + // console.log(stylesheetUrl) + const stylesheet = await page.goto(stylesheetUrl); + // const content = await stylesheet.text(); + // console.log(content) + + }); + + + + + // const content = await page.content(); + // console.log(content) + // console.log(content) + // stylesheets.forEach(async (stylesheetUrl) => { + // if(!stylesheetUrl.startsWith('http')) return; + + // const stylesheet = await page.goto(stylesheetUrl); + // const content = await stylesheet.text(); + // console.log(content) + // console.log(stylesheet) + // const response = await page.goto(stylesheet); + // const content = await response.text(); + // console.log(content) + // }); + + // extract scripts + // save all scripts to disk + + + + + await page.screenshot({path: `${directory}/screenshot.png`}); await browser.close(); } - extractDomain(urlString) { + extractDomain(urlString: 
string) { const url = new URL(urlString); return url.hostname; }