downloading scriptsheets
This commit is contained in:
parent
7385d5b6ab
commit
e4d982891f
1 changed files with 35 additions and 27 deletions
|
@ -5,22 +5,24 @@ import puppeteer from 'puppeteer';
|
||||||
import { URL } from 'url';
|
import { URL } from 'url';
|
||||||
import axios from 'axios';
|
import axios from 'axios';
|
||||||
|
|
||||||
|
interface ScreenshotResponse {
|
||||||
|
buffer: Buffer;
|
||||||
|
path?: string;
|
||||||
|
}
|
||||||
|
|
||||||
@Injectable()
|
@Injectable()
|
||||||
export class CrawlerService {
|
export class CrawlerService {
|
||||||
|
|
||||||
async crawl(url: string){
|
async crawl(url: string){
|
||||||
const browser = await puppeteer.launch();
|
const browser = await puppeteer.launch();
|
||||||
const page = await browser.newPage();
|
const page = await browser.newPage();
|
||||||
|
const domain = this.extractDomain(url);
|
||||||
await page.goto(url);
|
await page.goto(url);
|
||||||
const directory = `crawler_assets/${this.extractDomain(url)}/`;
|
const directory = `crawler_assets/${domain}/`;
|
||||||
if (!fs.existsSync(directory)) {
|
if (!fs.existsSync(directory)) {
|
||||||
mkdirSync(directory);
|
mkdirSync(directory);
|
||||||
}
|
}
|
||||||
await page.pdf({path: `${directory}/page.pdf`});
|
// STYLESHEETS //
|
||||||
|
|
||||||
// extract stylesheets
|
|
||||||
// save all stylesheets to disk
|
|
||||||
|
|
||||||
const stylesheetsUrls = await page.$$eval('link[rel="stylesheet"]', links => links.map(link => link.href));
|
const stylesheetsUrls = await page.$$eval('link[rel="stylesheet"]', links => links.map(link => link.href));
|
||||||
let cssDir = `${directory}/css/`
|
let cssDir = `${directory}/css/`
|
||||||
if (!fs.existsSync(cssDir)) {
|
if (!fs.existsSync(cssDir)) {
|
||||||
|
@ -32,33 +34,39 @@ export class CrawlerService {
|
||||||
const content = response.data;
|
const content = response.data;
|
||||||
fs.writeFileSync(`${cssDir}${stylesheetUrl.split('/').pop()}`, content);
|
fs.writeFileSync(`${cssDir}${stylesheetUrl.split('/').pop()}`, content);
|
||||||
});
|
});
|
||||||
|
// STYLESHEETS //
|
||||||
|
|
||||||
|
// SCRIPTS //
|
||||||
|
const scriptsUrls = await page.$$eval('script', scripts => scripts.map(script => script.src));
|
||||||
|
let scriptsDir = `${directory}/scripts/`
|
||||||
|
if (!fs.existsSync(scriptsDir)) {
|
||||||
|
mkdirSync(scriptsDir);
|
||||||
|
}
|
||||||
|
console.log(scriptsUrls)
|
||||||
|
|
||||||
|
scriptsUrls.forEach(async (scriptUrl) => {
|
||||||
|
if (!scriptUrl.startsWith('http')) return;
|
||||||
|
const response = await axios.get(scriptUrl);
|
||||||
|
const content = response.data;
|
||||||
|
fs.writeFileSync(`${scriptsDir}${scriptUrl.split('/').pop()}`, content);
|
||||||
|
});
|
||||||
|
// SCRIPTS //
|
||||||
|
|
||||||
|
// SCREENSHOT //
|
||||||
|
const screenshotBuffer: Buffer = await page.screenshot();
|
||||||
|
fs.writeFile(`${directory}screenshot.png`, screenshotBuffer, (err) => {
|
||||||
|
if (err) throw err;
|
||||||
|
console.log(`Screenshot saved! ${directory}screenshot.png`);
|
||||||
|
});
|
||||||
|
// SCREENSHOT //
|
||||||
|
|
||||||
|
// URLS //
|
||||||
|
const urls = await page.$$eval('a', links => links.map(link => link.href));
|
||||||
|
console.log(urls);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// const content = await page.content();
|
|
||||||
// console.log(content)
|
|
||||||
// console.log(content)
|
|
||||||
// stylesheets.forEach(async (stylesheetUrl) => {
|
|
||||||
// if(!stylesheetUrl.startsWith('http')) return;
|
|
||||||
|
|
||||||
// const stylesheet = await page.goto(stylesheetUrl);
|
|
||||||
// const content = await stylesheet.text();
|
|
||||||
// console.log(content)
|
|
||||||
// console.log(stylesheet)
|
|
||||||
// const response = await page.goto(stylesheet);
|
|
||||||
// const content = await response.text();
|
|
||||||
// console.log(content)
|
|
||||||
// });
|
|
||||||
|
|
||||||
// extract scripts
|
|
||||||
// save all scripts to disk
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
await page.screenshot({path: `${directory}/screenshot.png`});
|
|
||||||
await browser.close();
|
await browser.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue