Add urlsList to the crawl result
This commit is contained in:
parent
9d3e39683c
commit
75dc340ec5
1 changed file with 7 additions and 1 deletion
|
@@ -10,6 +10,7 @@ const environment = process.env.NODE_ENV || 'development';
|
||||||
@Injectable()
|
@Injectable()
|
||||||
export class CrawlerService {
|
export class CrawlerService {
|
||||||
async crawl(url: string): Promise<any> {
|
async crawl(url: string): Promise<any> {
|
||||||
|
console.log("start crawl", url);
|
||||||
const browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox'] });
|
const browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox'] });
|
||||||
const page = await browser.newPage();
|
const page = await browser.newPage();
|
||||||
const domain = this.extractDomain(url);
|
const domain = this.extractDomain(url);
|
||||||
|
@@ -19,6 +20,7 @@ export class CrawlerService {
|
||||||
mkdirSync(directory);
|
mkdirSync(directory);
|
||||||
}
|
}
|
||||||
// STYLESHEETS //
|
// STYLESHEETS //
|
||||||
|
console.log("start stylesheets")
|
||||||
const stylesheetsUrls = await page.$$eval('link[rel="stylesheet"]', links => links.map(link => link.href));
|
const stylesheetsUrls = await page.$$eval('link[rel="stylesheet"]', links => links.map(link => link.href));
|
||||||
let cssDir = `${directory}/css/`
|
let cssDir = `${directory}/css/`
|
||||||
const cssSheetsLocation = await this.downloadFiles(stylesheetsUrls, cssDir);
|
const cssSheetsLocation = await this.downloadFiles(stylesheetsUrls, cssDir);
|
||||||
|
@@ -42,10 +44,14 @@ export class CrawlerService {
|
||||||
|
|
||||||
// URLS //
|
// URLS //
|
||||||
const urls = await page.$$eval('a', links => links.map(link => link.href));
|
const urls = await page.$$eval('a', links => links.map(link => link.href));
|
||||||
|
const urlsList = urls.filter((url) => url.startsWith('http'));
|
||||||
|
console.log(urlsList);
|
||||||
|
|
||||||
await browser.close();
|
await browser.close();
|
||||||
return {
|
return {
|
||||||
cssSheetsLocation,
|
cssSheetsLocation,
|
||||||
scriptsSheetsLocation
|
scriptsSheetsLocation,
|
||||||
|
urlsList
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue