puppeteer added

This commit is contained in:
Kfir Dayan 2023-04-17 19:45:40 +03:00
parent a130f9065f
commit 37dcac866c
5 changed files with 529 additions and 149 deletions

4
.gitignore vendored
View file

@ -3,6 +3,10 @@
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# puppeteer's output
crawler_assets/**/*
# User-specific stuff: # User-specific stuff:
.idea/**/workspace.xml .idea/**/workspace.xml
.idea/**/tasks.xml .idea/**/tasks.xml

646
package-lock.json generated

File diff suppressed because it is too large Load diff

View file

@ -22,6 +22,7 @@
"@nestjs/common": "^9.0.0", "@nestjs/common": "^9.0.0",
"@nestjs/core": "^9.0.5", "@nestjs/core": "^9.0.5",
"@nestjs/platform-express": "^9.0.0", "@nestjs/platform-express": "^9.0.0",
"puppeteer": "^19.9.1",
"reflect-metadata": "^0.1.13", "reflect-metadata": "^0.1.13",
"rxjs": "^7.5.5" "rxjs": "^7.5.5"
}, },

View file

@ -3,15 +3,9 @@ import { CrawlerService } from '../crawler/crawler.service';
@Controller('/') @Controller('/')
export class ApiController { export class ApiController {
constructor(private crawlerService: CrawlerService) {} constructor(private crawlerService: CrawlerService) {}
// Have an HTTP endpoint, "/crawl" with a JSON body "{ url: string }" that activates the crawling on the specified website.
// this route should accept a POST request
@Post('crawl') @Post('crawl')
async crawl(@Body() body: { url: string }) { async crawl(@Body() body: { url: string }) {
return this.crawlerService.crawl(body.url); return this.crawlerService.crawl(body.url);
} }
} }

View file

@ -1,9 +1,28 @@
import { Injectable } from '@nestjs/common'; import { Injectable } from '@nestjs/common';
import { mkdirSync } from 'fs';
import * as fs from 'fs';
import puppeteer from 'puppeteer';
import { URL } from 'url';
@Injectable() @Injectable()
export class CrawlerService { export class CrawlerService {
async crawl(url: string){ async crawl(url: string){
return `Crawling... ${url}`; const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url);
const directory = `crawler_assets/${this.extractDomain(url)}/`;
console.log(directory)
if (!fs.existsSync(directory)) {
mkdirSync(directory);
}
await page.pdf({path: `${directory}/page.pdf`, format: 'A4'});
await page.screenshot({path: `${directory}/screenshot.png`});
await browser.close();
}
extractDomain(urlString) {
const url = new URL(urlString);
return url.hostname;
} }
} }