puppeteer added
This commit is contained in:
parent
a130f9065f
commit
37dcac866c
5 changed files with 529 additions and 149 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -3,6 +3,10 @@
|
||||||
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
|
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
|
||||||
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
||||||
|
|
||||||
|
# puppeteer's output
|
||||||
|
crawler_assets/**/*
|
||||||
|
|
||||||
|
|
||||||
# User-specific stuff:
|
# User-specific stuff:
|
||||||
.idea/**/workspace.xml
|
.idea/**/workspace.xml
|
||||||
.idea/**/tasks.xml
|
.idea/**/tasks.xml
|
||||||
|
|
646
package-lock.json
generated
646
package-lock.json
generated
File diff suppressed because it is too large
Load diff
|
@ -22,6 +22,7 @@
|
||||||
"@nestjs/common": "^9.0.0",
|
"@nestjs/common": "^9.0.0",
|
||||||
"@nestjs/core": "^9.0.5",
|
"@nestjs/core": "^9.0.5",
|
||||||
"@nestjs/platform-express": "^9.0.0",
|
"@nestjs/platform-express": "^9.0.0",
|
||||||
|
"puppeteer": "^19.9.1",
|
||||||
"reflect-metadata": "^0.1.13",
|
"reflect-metadata": "^0.1.13",
|
||||||
"rxjs": "^7.5.5"
|
"rxjs": "^7.5.5"
|
||||||
},
|
},
|
||||||
|
|
|
@ -3,15 +3,9 @@ import { CrawlerService } from '../crawler/crawler.service';
|
||||||
|
|
||||||
@Controller('/')
|
@Controller('/')
|
||||||
export class ApiController {
|
export class ApiController {
|
||||||
|
|
||||||
constructor(private crawlerService: CrawlerService) {}
|
constructor(private crawlerService: CrawlerService) {}
|
||||||
// Have an HTTP endpoint, "/crawl" with a JSON body "{ url: string }" that activates the crawling on the specified website.
|
|
||||||
// this route should accept a POST request
|
|
||||||
@Post('crawl')
|
@Post('crawl')
|
||||||
async crawl(@Body() body: { url: string }) {
|
async crawl(@Body() body: { url: string }) {
|
||||||
return this.crawlerService.crawl(body.url);
|
return this.crawlerService.crawl(body.url);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,9 +1,28 @@
|
||||||
import { Injectable } from '@nestjs/common';
|
import { Injectable } from '@nestjs/common';
|
||||||
|
import { mkdirSync } from 'fs';
|
||||||
|
import * as fs from 'fs';
|
||||||
|
import puppeteer from 'puppeteer';
|
||||||
|
import { URL } from 'url';
|
||||||
|
|
||||||
@Injectable()
|
@Injectable()
|
||||||
export class CrawlerService {
|
export class CrawlerService {
|
||||||
|
|
||||||
async crawl(url: string){
|
async crawl(url: string){
|
||||||
return `Crawling... ${url}`;
|
const browser = await puppeteer.launch();
|
||||||
|
const page = await browser.newPage();
|
||||||
|
await page.goto(url);
|
||||||
|
const directory = `crawler_assets/${this.extractDomain(url)}/`;
|
||||||
|
console.log(directory)
|
||||||
|
if (!fs.existsSync(directory)) {
|
||||||
|
mkdirSync(directory);
|
||||||
|
}
|
||||||
|
await page.pdf({path: `${directory}/page.pdf`, format: 'A4'});
|
||||||
|
await page.screenshot({path: `${directory}/screenshot.png`});
|
||||||
|
await browser.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
extractDomain(urlString) {
|
||||||
|
const url = new URL(urlString);
|
||||||
|
return url.hostname;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue