This commit is contained in:
Kfir Dayan 2023-04-18 10:51:46 +03:00
parent daa1f363c8
commit d81f21ee64

View file

@ -12,16 +12,55 @@ export class CrawlerService {
const page = await browser.newPage(); const page = await browser.newPage();
await page.goto(url); await page.goto(url);
const directory = `crawler_assets/${this.extractDomain(url)}/`; const directory = `crawler_assets/${this.extractDomain(url)}/`;
console.log(directory)
if (!fs.existsSync(directory)) { if (!fs.existsSync(directory)) {
mkdirSync(directory); mkdirSync(directory);
} }
await page.pdf({path: `${directory}/page.pdf`, format: 'A4'}); await page.pdf({path: `${directory}/page.pdf`});
// extract stylesheets
// save all stylesheets to disk
const stylesheetsUrls = await page.$$eval('link[rel="stylesheet"]', links => links.map(link => link.href));
console.log(stylesheetsUrls)
stylesheetsUrls.forEach(async (stylesheetUrl) => {
if(!stylesheetUrl.startsWith('http')) return;
// console.log(stylesheetUrl)
const stylesheet = await page.goto(stylesheetUrl);
// const content = await stylesheet.text();
// console.log(content)
});
// const content = await page.content();
// console.log(content)
// console.log(content)
// stylesheets.forEach(async (stylesheetUrl) => {
// if(!stylesheetUrl.startsWith('http')) return;
// const stylesheet = await page.goto(stylesheetUrl);
// const content = await stylesheet.text();
// console.log(content)
// console.log(stylesheet)
// const response = await page.goto(stylesheet);
// const content = await response.text();
// console.log(content)
// });
// extract scripts
// save all scripts to disk
await page.screenshot({path: `${directory}/screenshot.png`}); await page.screenshot({path: `${directory}/screenshot.png`});
await browser.close(); await browser.close();
} }
extractDomain(urlString) { extractDomain(urlString: string) {
const url = new URL(urlString); const url = new URL(urlString);
return url.hostname; return url.hostname;
} }