diff --git a/src/crawler/crawler.service.ts b/src/crawler/crawler.service.ts index 4150e30..95ad45a 100644 --- a/src/crawler/crawler.service.ts +++ b/src/crawler/crawler.service.ts @@ -5,22 +5,24 @@ import puppeteer from 'puppeteer'; import { URL } from 'url'; import axios from 'axios'; +interface ScreenshotResponse { + buffer: Buffer; + path?: string; +} + @Injectable() export class CrawlerService { async crawl(url: string){ const browser = await puppeteer.launch(); const page = await browser.newPage(); + const domain = this.extractDomain(url); await page.goto(url); - const directory = `crawler_assets/${this.extractDomain(url)}/`; + const directory = `crawler_assets/${domain}/`; if (!fs.existsSync(directory)) { mkdirSync(directory); } - await page.pdf({path: `${directory}/page.pdf`}); - - // extract stylesheets - // svae all stylesheets to disk - + // STYLESHEETS // const stylesheetsUrls = await page.$$eval('link[rel="stylesheet"]', links => links.map(link => link.href)); let cssDir = `${directory}/css/` if (!fs.existsSync(cssDir)) { @@ -32,33 +34,39 @@ export class CrawlerService { const content = response.data; fs.writeFileSync(`${cssDir}${stylesheetUrl.split('/').pop()}`, content); }); + // STYLESHEETS // + // SCRIPTS // + const scriptsUrls = await page.$$eval('script', scripts => scripts.map(script => script.src)); + let scriptsDir = `${directory}/scripts/` + if (!fs.existsSync(scriptsDir)) { + mkdirSync(scriptsDir); + } + console.log(scriptsUrls) + scriptsUrls.forEach(async (scriptUrl) => { + if (!scriptUrl.startsWith('http')) return; + const response = await axios.get(scriptUrl); + const content = response.data; + fs.writeFileSync(`${scriptsDir}${scriptUrl.split('/').pop()}`, content); + }); + // SCRIPTS // + // SCREENSHOT // + const screenshotBuffer: Buffer = await page.screenshot(); + fs.writeFile(`${directory}screenshot.png`, screenshotBuffer, (err) => { + if (err) throw err; + console.log(`Screenshot saved! ${directory}screenshot.png`); + }); + // SCREENSHOT // - // const content = await page.content(); - // console.log(content) - // console.log(content) - // stylesheets.forEach(async (stylesheetUrl) => { - // if(!stylesheetUrl.startsWith('http')) return; - - // const stylesheet = await page.goto(stylesheetUrl); - // const content = await stylesheet.text(); - // console.log(content) - // console.log(stylesheet) - // const response = await page.goto(stylesheet); - // const content = await response.text(); - // console.log(content) - // }); - - // extract scripts - // save all scripts to disk - - - + // URLS // + const urls = await page.$$eval('a', links => links.map(link => link.href)); + console.log(urls); - await page.screenshot({path: `${directory}/screenshot.png`}); + + await browser.close(); }