downloading scripts

This commit is contained in:
Kfir Dayan 2023-04-18 11:10:24 +03:00
parent 7385d5b6ab
commit e4d982891f

View file

@ -5,22 +5,24 @@ import puppeteer from 'puppeteer';
import { URL } from 'url';
import axios from 'axios';
/**
 * Shape of a screenshot result: a required binary buffer plus an optional
 * path string.
 *
 * NOTE(review): nothing in the visible part of this file references this
 * interface, so the field meanings below are inferred from the names only —
 * confirm against the actual callers.
 */
interface ScreenshotResponse {
  /** Optional path; presumably where the image was written on disk — TODO confirm. */
  path?: string;
  /** Binary payload; presumably the captured image bytes — TODO confirm. */
  buffer: Buffer;
}
@Injectable()
export class CrawlerService {
async crawl(url: string){
const browser = await puppeteer.launch();
const page = await browser.newPage();
const domain = this.extractDomain(url);
await page.goto(url);
const directory = `crawler_assets/${this.extractDomain(url)}/`;
const directory = `crawler_assets/${domain}/`;
if (!fs.existsSync(directory)) {
mkdirSync(directory);
}
await page.pdf({path: `${directory}/page.pdf`});
// extract stylesheets
// save all stylesheets to disk
// STYLESHEETS //
const stylesheetsUrls = await page.$$eval('link[rel="stylesheet"]', links => links.map(link => link.href));
let cssDir = `${directory}/css/`
if (!fs.existsSync(cssDir)) {
@ -32,33 +34,39 @@ export class CrawlerService {
const content = response.data;
fs.writeFileSync(`${cssDir}${stylesheetUrl.split('/').pop()}`, content);
});
// STYLESHEETS //
// SCRIPTS //
const scriptsUrls = await page.$$eval('script', scripts => scripts.map(script => script.src));
let scriptsDir = `${directory}/scripts/`
if (!fs.existsSync(scriptsDir)) {
mkdirSync(scriptsDir);
}
console.log(scriptsUrls)
scriptsUrls.forEach(async (scriptUrl) => {
if (!scriptUrl.startsWith('http')) return;
const response = await axios.get(scriptUrl);
const content = response.data;
fs.writeFileSync(`${scriptsDir}${scriptUrl.split('/').pop()}`, content);
});
// SCRIPTS //
// SCREENSHOT //
const screenshotBuffer: Buffer = await page.screenshot();
fs.writeFile(`${directory}screenshot.png`, screenshotBuffer, (err) => {
if (err) throw err;
console.log(`Screenshot saved! ${directory}screenshot.png`);
});
// SCREENSHOT //
// URLS //
const urls = await page.$$eval('a', links => links.map(link => link.href));
console.log(urls);
// const content = await page.content();
// console.log(content)
// console.log(content)
// stylesheets.forEach(async (stylesheetUrl) => {
// if(!stylesheetUrl.startsWith('http')) return;
// const stylesheet = await page.goto(stylesheetUrl);
// const content = await stylesheet.text();
// console.log(content)
// console.log(stylesheet)
// const response = await page.goto(stylesheet);
// const content = await response.text();
// console.log(content)
// });
// extract scripts
// save all scripts to disk
await page.screenshot({path: `${directory}/screenshot.png`});
await browser.close();
}