From 75dc340ec51a4b4c1ac45869d82470f3fc0696db Mon Sep 17 00:00:00 2001
From: Kfir Dayan
Date: Tue, 18 Apr 2023 22:00:28 +0300
Subject: [PATCH] adding urlsList to the crawling

---
 src/crawler/crawler.service.ts | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/crawler/crawler.service.ts b/src/crawler/crawler.service.ts
index 283b4a6..b88760d 100644
--- a/src/crawler/crawler.service.ts
+++ b/src/crawler/crawler.service.ts
@@ -10,6 +10,7 @@ const environment = process.env.NODE_ENV || 'development';
 @Injectable()
 export class CrawlerService {
   async crawl(url: string): Promise {
+    console.log("start crawl", url);
     const browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox'] });
     const page = await browser.newPage();
     const domain = this.extractDomain(url);
@@ -19,6 +20,7 @@ export class CrawlerService {
       mkdirSync(directory);
     }
     // STYLESHEETS //
+    console.log("start stylesheets")
     const stylesheetsUrls = await page.$$eval('link[rel="stylesheet"]', links => links.map(link => link.href));
     let cssDir = `${directory}/css/`
     const cssSheetsLocation = await this.downloadFiles(stylesheetsUrls, cssDir);
@@ -42,10 +44,14 @@ export class CrawlerService {
 
     // URLS //
     const urls = await page.$$eval('a', links => links.map(link => link.href));
+    const urlsList = urls.filter((url) => url.startsWith('http'));
+    console.log(urlsList);
+
     await browser.close();
     return {
       cssSheetsLocation,
       scriptsSheetsLocation
+      scriptsSheetsLocation,
+      urlsList
     }
   }
 