puppeteer added
This commit is contained in:
parent
a130f9065f
commit
37dcac866c
5 changed files with 529 additions and 149 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -3,6 +3,10 @@
|
|||
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
|
||||
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
||||
|
||||
# puppeteer's output
|
||||
crawler_assets/**/*
|
||||
|
||||
|
||||
# User-specific stuff:
|
||||
.idea/**/workspace.xml
|
||||
.idea/**/tasks.xml
|
||||
|
|
646
package-lock.json
generated
646
package-lock.json
generated
File diff suppressed because it is too large
Load diff
|
@ -22,6 +22,7 @@
|
|||
"@nestjs/common": "^9.0.0",
|
||||
"@nestjs/core": "^9.0.5",
|
||||
"@nestjs/platform-express": "^9.0.0",
|
||||
"puppeteer": "^19.9.1",
|
||||
"reflect-metadata": "^0.1.13",
|
||||
"rxjs": "^7.5.5"
|
||||
},
|
||||
|
|
|
@ -3,15 +3,9 @@ import { CrawlerService } from '../crawler/crawler.service';
|
|||
|
||||
/**
 * HTTP entry point for the crawler, mounted at the root path.
 */
@Controller('/')
export class ApiController {
  constructor(private crawlerService: CrawlerService) {}

  /**
   * Handles `POST /crawl`.
   *
   * Expects a JSON body of the form `{ url: string }` and forwards the URL to
   * `CrawlerService.crawl`, returning whatever that call resolves to.
   *
   * @param body Parsed request body carrying the URL of the website to crawl.
   */
  @Post('crawl')
  async crawl(@Body() body: { url: string }) {
    const { url } = body;
    return this.crawlerService.crawl(url);
  }
}
|
||||
|
|
|
@ -1,9 +1,28 @@
|
|||
import { Injectable } from '@nestjs/common';
|
||||
import { mkdirSync } from 'fs';
|
||||
import * as fs from 'fs';
|
||||
import puppeteer from 'puppeteer';
|
||||
import { URL } from 'url';
|
||||
|
||||
/**
 * Captures a PDF and a screenshot of a web page with a puppeteer-driven browser.
 */
@Injectable()
export class CrawlerService {
  /**
   * Loads `url` in a browser and writes `page.pdf` and `screenshot.png`
   * under `crawler_assets/<hostname>/`.
   *
   * NOTE(review): `url` appears to come straight from the request body; consider
   * restricting it to http/https before navigating — confirm against the caller.
   *
   * @param url Absolute URL of the page to capture.
   * @returns A status message naming the URL, resolved once both files are written.
   * @throws TypeError when `url` cannot be parsed; rejects when launching the
   *   browser, navigating, or rendering fails.
   */
  async crawl(url: string): Promise<string> {
    // Resolve the output directory first so a malformed URL fails before a browser is started.
    const directory = `crawler_assets/${this.extractDomain(url)}`;

    // `recursive` also creates the `crawler_assets` parent and is a no-op when the
    // directory already exists, so no separate existence check is needed.
    mkdirSync(directory, { recursive: true });

    const browser = await puppeteer.launch();
    try {
      const page = await browser.newPage();
      await page.goto(url);
      await page.pdf({ path: `${directory}/page.pdf`, format: 'A4' });
      await page.screenshot({ path: `${directory}/screenshot.png` });
    } finally {
      // Always release the browser process, even when navigation or rendering throws.
      await browser.close();
    }

    return `Crawling... ${url}`;
  }

  /**
   * Returns the hostname component of an absolute URL.
   *
   * @param urlString Absolute URL to parse.
   * @returns The URL's hostname (no scheme, port, or path).
   * @throws TypeError when `urlString` is not a valid absolute URL.
   */
  extractDomain(urlString: string): string {
    return new URL(urlString).hostname;
  }
}
|
||||
|
|
Loading…
Reference in a new issue