adding urlsList to the crawl result

author Kfir Dayan 2023-04-18 22:00:28 +03:00
parent 9d3e39683c
commit 75dc340ec5


@@ -10,6 +10,7 @@ const environment = process.env.NODE_ENV || 'development';
 @Injectable()
 export class CrawlerService {
   async crawl(url: string): Promise<any> {
+    console.log("start crawl", url);
     const browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox'] });
     const page = await browser.newPage();
     const domain = this.extractDomain(url);
@@ -19,6 +20,7 @@ export class CrawlerService {
       mkdirSync(directory);
     }
     // STYLESHEETS //
+    console.log("start stylesheets")
     const stylesheetsUrls = await page.$$eval('link[rel="stylesheet"]', links => links.map(link => link.href));
     let cssDir = `${directory}/css/`
     const cssSheetsLocation = await this.downloadFiles(stylesheetsUrls, cssDir);
@@ -42,10 +44,14 @@ export class CrawlerService {
     // URLS //
     const urls = await page.$$eval('a', links => links.map(link => link.href));
+    const urlsList = urls.filter((url) => url.startsWith('http'));
+    console.log(urlsList);
     await browser.close();
     return {
       cssSheetsLocation,
-      scriptsSheetsLocation
+      scriptsSheetsLocation,
+      urlsList
     }
   }
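
For context, the new urlsList field exposes every absolute link found on the page, so a caller can use the crawl result to drive further crawls. Below is a minimal sketch of such a consumer; the CrawlRunner class, the crawlWithLinks method, and the maxLinks cap are hypothetical and not part of this commit, which only shows CrawlerService.

// Hypothetical consumer of CrawlerService.crawl() — illustration only.
// Assumes CrawlerService is exported from './crawler.service' and
// registered in a Nest module in the usual way.
import { Injectable } from '@nestjs/common';
import { CrawlerService } from './crawler.service';

@Injectable()
export class CrawlRunner {
  constructor(private readonly crawler: CrawlerService) {}

  // Crawl a page, then follow each discovered link once.
  // The Set de-duplicates repeated hrefs; maxLinks caps the fan-out.
  async crawlWithLinks(url: string, maxLinks = 5): Promise<void> {
    const { cssSheetsLocation, scriptsSheetsLocation, urlsList } =
      await this.crawler.crawl(url);
    console.log('assets saved:', cssSheetsLocation, scriptsSheetsLocation);

    const unique = [...new Set<string>(urlsList)].slice(0, maxLinks);
    for (const link of unique) {
      await this.crawler.crawl(link); // one level deep, no recursion guard
    }
  }
}

A real crawler would also track visited URLs across calls and restrict follow-ups to the original domain; the sketch leaves both out for brevity.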