Add urlsList to the crawl result

This commit is contained in:
Kfir Dayan 2023-04-18 22:00:28 +03:00
parent 9d3e39683c
commit 75dc340ec5

View file

@ -10,6 +10,7 @@ const environment = process.env.NODE_ENV || 'development';
@Injectable()
export class CrawlerService {
async crawl(url: string): Promise<any> {
console.log("start crawl", url);
const browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox'] });
const page = await browser.newPage();
const domain = this.extractDomain(url);
@ -19,6 +20,7 @@ export class CrawlerService {
mkdirSync(directory);
}
// STYLESHEETS //
console.log("start stylesheets")
const stylesheetsUrls = await page.$$eval('link[rel="stylesheet"]', links => links.map(link => link.href));
let cssDir = `${directory}/css/`
const cssSheetsLocation = await this.downloadFiles(stylesheetsUrls, cssDir);
@ -42,10 +44,14 @@ export class CrawlerService {
// URLS //
const urls = await page.$$eval('a', links => links.map(link => link.href));
const urlsList = urls.filter((url) => url.startsWith('http'));
console.log(urlsList);
await browser.close();
return {
cssSheetsLocation,
scriptsSheetsLocation
scriptsSheetsLocation,
urlsList
}
}