Compare commits
11 commits
e775a58400
...
8b8e79d159
Author | SHA1 | Date | |
---|---|---|---|
8b8e79d159 | |||
43b598689c | |||
75c4f55c01 | |||
c57a13d8d8 | |||
15c718dd0a | |||
9044d96a55 | |||
275d22e045 | |||
75dc340ec5 | |||
9d3e39683c | |||
4b7d4d9022 | |||
acb563b3a0 |
16 changed files with 2219 additions and 776 deletions
1
.dockerignore
Normal file
1
.dockerignore
Normal file
|
@ -0,0 +1 @@
|
|||
node_modules
|
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -4,7 +4,7 @@
|
|||
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
||||
|
||||
# puppeteer's output
|
||||
crawler_assets/**/*
|
||||
sites_assets/**/*
|
||||
|
||||
|
||||
# User-specific stuff:
|
||||
|
|
18
Dockerfile
18
Dockerfile
|
@ -1,13 +1,14 @@
|
|||
FROM ubuntu:20.04 AS base
|
||||
FROM node:14-alpine AS base
|
||||
|
||||
# Set non-interactive mode
|
||||
ENV DEBIAN_FRONTEND noninteractive
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV NODE_ENV=production
|
||||
|
||||
# Install required packages
|
||||
RUN apt-get update && \
|
||||
curl -sL https://deb.nodesource.com/setup_14.x | bash - && \
|
||||
apt-get install -y nodejs chromium-browser
|
||||
# Install chromium
|
||||
RUN apk add --no-cache chromium chromium-chromedriver
|
||||
|
||||
# Create a symbolic link for google-chrome
|
||||
RUN ln -s /usr/bin/chromium-browser /usr/bin/google-chrome
|
||||
|
||||
# Set the working directory
|
||||
WORKDIR /app
|
||||
|
@ -16,10 +17,11 @@ WORKDIR /app
|
|||
COPY package*.json ./
|
||||
|
||||
# Install dependencies
|
||||
RUN npm install
|
||||
RUN npm install -g npm@9.6.3 && \
|
||||
npm install
|
||||
|
||||
# Copy the source code
|
||||
COPY . .
|
||||
|
||||
# Start the application
|
||||
CMD ["npm", "run", "start:prod"]
|
||||
CMD ["npm", "run", "start"]
|
20
README.md
20
README.md
|
@ -6,9 +6,7 @@ The crawler is a simple crawler that crawls the web and stores the results in a
|
|||
## Crawler ##
|
||||
|
||||
|
||||
|
||||
### Usage ###
|
||||
|
||||
Post a JSON object to the crawler with the following format:
|
||||
|
||||
`domain.com/crawl`
|
||||
|
@ -17,4 +15,20 @@ Post a JSON object to the crawler with the following format:
|
|||
}
|
||||
|
||||
The crawler will then crawl the given url and store the results in a database and assets in a file system
|
||||
`crawler_assests/www.example.com/`.
|
||||
`sites_assets/www.example.com/`.
|
||||
|
||||
|
||||
# API #
|
||||
|
||||
The API is a simple API that serves the results of the crawler.
|
||||
|
||||
# Routes #
|
||||
## GET ##
|
||||
/sites - Returns a list of all sites
|
||||
/sites/:id - Returns the site object for the given site Id
|
||||
/sites/domain/:domain - Returns the site object for the given domain
|
||||
## DELETE ##
|
||||
/sites/:id - Deletes the site object for the given site Id
|
||||
/sites/domain/:domain - Deletes the site object for the given domain
|
||||
## POST ##
|
||||
/sites/:id - Updates the site object for the given site Id
|
|
@ -5,17 +5,17 @@ services:
|
|||
ports:
|
||||
- '3000:3000'
|
||||
depends_on:
|
||||
- db
|
||||
- mongo
|
||||
environment:
|
||||
- MONGO_URL=mongodb://db:27017/nestjs
|
||||
- MONGO_URL=mongodb://mongo:27017/mydatabase
|
||||
networks:
|
||||
- appnet
|
||||
db:
|
||||
mongo:
|
||||
image: mongo
|
||||
environment:
|
||||
- MONGO_INITDB_DATABASE=nestjs
|
||||
- MONGO_INITDB_ROOT_USERNAME=admin
|
||||
- MONGO_INITDB_ROOT_PASSWORD=adminpassword
|
||||
- MONGO_INITDB_DATABASE=mydatabase
|
||||
- MONGO_INITDB_ROOT_USERNAME=akamai
|
||||
- MONGO_INITDB_ROOT_PASSWORD=password
|
||||
volumes:
|
||||
- dbdata:/data/db
|
||||
networks:
|
||||
|
|
2763
package-lock.json
generated
2763
package-lock.json
generated
File diff suppressed because it is too large
Load diff
|
@ -25,13 +25,14 @@
|
|||
"@nestjs/platform-express": "^9.0.0",
|
||||
"axios": "^1.3.5",
|
||||
"dotenv": "^16.0.3",
|
||||
"minimatch": "^9.0.0",
|
||||
"mongoose": "^7.0.4",
|
||||
"puppeteer": "^19.9.1",
|
||||
"reflect-metadata": "^0.1.13",
|
||||
"@nestjs/cli": "^9.0.0",
|
||||
"rxjs": "^7.5.5"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@nestjs/cli": "^9.0.0",
|
||||
"@nestjs/schematics": "^9.0.0",
|
||||
"@nestjs/testing": "^9.0.0",
|
||||
"@types/express": "^4.17.13",
|
||||
|
|
|
@ -1,21 +1,68 @@
|
|||
import { Body, Controller, Post } from '@nestjs/common';
|
||||
import { Body, Controller, Delete, Get, Param, Post } from '@nestjs/common';
|
||||
import { CrawlerService } from '../crawler/crawler.service';
|
||||
import { DbService } from '../db/db.service';
|
||||
import { InjectModel } from '@nestjs/mongoose';
|
||||
import { Model } from 'mongoose';
|
||||
import { Site } from 'src/interfaces/site.interface';
|
||||
|
||||
@Controller('/')
|
||||
export class ApiController {
|
||||
constructor(private crawlerService: CrawlerService, private DbService: DbService) {}
|
||||
|
||||
constructor(private crawlerService: CrawlerService, @InjectModel('Site') private readonly siteModel: Model<Site> ) {}
|
||||
|
||||
@Post('crawl')
|
||||
async crawl(@Body() body: { url: string }) {
|
||||
|
||||
const results = this.crawlerService.crawl(body.url);
|
||||
results.then((data) => {
|
||||
console.log(data)
|
||||
this.DbService.insert(data, 'crawler');
|
||||
console.log("Done crawling !", data);
|
||||
const newSite = new this.siteModel(data);
|
||||
newSite.save().then((result) => {
|
||||
console.log("Site saved !", result);
|
||||
}).catch((err) => {
|
||||
console.log("Error saving site !", err.message);
|
||||
});
|
||||
}).catch((err) => {
|
||||
console.log("** Error crawling ! **", err);
|
||||
console.log(err);
|
||||
});
|
||||
return {
|
||||
message: 'Crawling in progress'
|
||||
message: 'Got your request for ' + body.url
|
||||
}
|
||||
}
|
||||
|
||||
// Get all
|
||||
@Get('sites')
|
||||
async getSites() {
|
||||
const sites = await this.siteModel.find().exec();
|
||||
return sites || {};
|
||||
}
|
||||
|
||||
// Get by id
|
||||
@Get('sites/:id')
|
||||
async getSite(@Param('id') id: string) {
|
||||
const site = await this.siteModel.findById(id).exec();
|
||||
return site || {};
|
||||
}
|
||||
|
||||
// Get by domain
|
||||
@Get('sites/domain/:domain')
|
||||
async getSiteByDomain(@Param('domain') domain: string) {
|
||||
const site = await this.siteModel.findOne({ domain }).exec();
|
||||
return site || {};
|
||||
}
|
||||
|
||||
// Delete by domain
|
||||
@Delete('sites/domain/:domain')
|
||||
async deleteSiteByDomain(@Param('domain') domain: string) {
|
||||
const site = await this.siteModel.findOneAndDelete({ domain }).exec();
|
||||
return site || {};
|
||||
}
|
||||
|
||||
// Delete by id
|
||||
@Delete('sites/:id')
|
||||
async deleteSite(@Param('id') id: string) {
|
||||
const site = await this.siteModel.findByIdAndDelete(id).exec();
|
||||
return site || {};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
import { Module } from '@nestjs/common';
|
||||
import { ApiController } from './api.controller';
|
||||
import { CrawlerService } from '../crawler/crawler.service';
|
||||
import { DbService } from '../db/db.service';
|
||||
import { SitesSchema } from '../schema/sites.schema';
|
||||
import { MongooseModule } from '@nestjs/mongoose';
|
||||
|
||||
@Module({
|
||||
imports: [MongooseModule.forFeature([{ name: 'Site', schema: SitesSchema }])],
|
||||
controllers: [ApiController],
|
||||
providers: [CrawlerService, DbService]
|
||||
providers: [CrawlerService]
|
||||
|
||||
})
|
||||
export class ApiModule {}
|
||||
|
|
|
@ -1,9 +1,20 @@
|
|||
import { Module } from '@nestjs/common';
|
||||
import { MongooseModule } from '@nestjs/mongoose';
|
||||
import { ApiModule } from './api/api.module';
|
||||
import { CrawlerModule } from './crawler/crawler.module';
|
||||
import { DbModule } from './db/db.module';
|
||||
const dotenv = require('dotenv');
|
||||
|
||||
dotenv.config();
|
||||
|
||||
@Module({
|
||||
imports: [ApiModule, CrawlerModule, DbModule]
|
||||
imports: [
|
||||
MongooseModule.forRoot(process.env.MONGO_URL, {
|
||||
useNewUrlParser: true,
|
||||
useUnifiedTopology: true,
|
||||
}),
|
||||
ApiModule,
|
||||
CrawlerModule
|
||||
]
|
||||
|
||||
})
|
||||
export class AppModule {}
|
||||
export class AppModule { }
|
||||
|
|
|
@ -1,9 +1,7 @@
|
|||
import { Module } from '@nestjs/common';
|
||||
import { CrawlerService } from './crawler.service';
|
||||
import { DbModule } from '../db/db.module';
|
||||
|
||||
@Module({
|
||||
imports: [DbModule],
|
||||
providers: [CrawlerService]
|
||||
})
|
||||
export class CrawlerModule {}
|
||||
|
|
|
@ -4,21 +4,31 @@ import * as fs from 'fs';
|
|||
import puppeteer from 'puppeteer';
|
||||
import { URL } from 'url';
|
||||
import axios from 'axios';
|
||||
|
||||
const environment = process.env.NODE_ENV || 'development';
|
||||
import { Site } from '../interfaces/site.interface';
|
||||
|
||||
@Injectable()
|
||||
export class CrawlerService {
|
||||
async crawl(url: string): Promise<any> {
|
||||
const browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox'] });
|
||||
|
||||
constructor() {
|
||||
if (!fs.existsSync('sites_assets')) {
|
||||
mkdirSync('sites_assets');
|
||||
}
|
||||
}
|
||||
|
||||
async crawl(url: string): Promise<Site> {
|
||||
|
||||
console.log("start crawl website", url);
|
||||
const browser = await puppeteer.launch({ executablePath: '/usr/bin/chromium-browser', headless: true, args: ['--no-sandbox'] });
|
||||
const page = await browser.newPage();
|
||||
const domain = this.extractDomain(url);
|
||||
await page.goto(url);
|
||||
const directory = `crawler_assets/${domain}/`;
|
||||
const directory = `sites_assets/${domain}/`;
|
||||
if (!fs.existsSync(directory)) {
|
||||
mkdirSync(directory);
|
||||
}
|
||||
|
||||
// STYLESHEETS //
|
||||
console.log("start stylesheets")
|
||||
const stylesheetsUrls = await page.$$eval('link[rel="stylesheet"]', links => links.map(link => link.href));
|
||||
let cssDir = `${directory}/css/`
|
||||
const cssSheetsLocation = await this.downloadFiles(stylesheetsUrls, cssDir);
|
||||
|
@ -34,18 +44,28 @@ export class CrawlerService {
|
|||
|
||||
// SCREENSHOT //
|
||||
const screenshotBuffer: Buffer = await page.screenshot({ fullPage: true });
|
||||
fs.writeFile(`${directory}screenshot.png`, screenshotBuffer, (err) => {
|
||||
if (err) throw err;
|
||||
// console.log(`Screenshot saved! ${directory}screenshot.png`);
|
||||
await new Promise((resolve, reject) => {
|
||||
fs.writeFile(`${directory}screenshot.png`, screenshotBuffer, (err) => {
|
||||
if (err) {
|
||||
reject(err);
|
||||
} else {
|
||||
resolve(true);
|
||||
}
|
||||
});
|
||||
});
|
||||
// SCREENSHOT //
|
||||
|
||||
// URLS //
|
||||
const urls = await page.$$eval('a', links => links.map(link => link.href));
|
||||
const urlsList = urls.filter((url) => url.startsWith('http'));
|
||||
console.log(urlsList);
|
||||
|
||||
await browser.close();
|
||||
return {
|
||||
domain,
|
||||
cssSheetsLocation,
|
||||
scriptsSheetsLocation
|
||||
scriptsSheetsLocation,
|
||||
urlsList
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -68,8 +88,7 @@ export class CrawlerService {
|
|||
if (fileLocation.length > 10) {
|
||||
fileLocation = fileLocation.substring(0, 10);
|
||||
}
|
||||
console.log("fileLocation: " + fileLocation)
|
||||
finalUrls.push(fileLocation);
|
||||
finalUrls.push(`${path}${fileLocation}`);
|
||||
console.log(`Saving file ${path}${fileLocation}`);
|
||||
fs.writeFileSync(`${path}${fileLocation}`, content);
|
||||
})
|
||||
|
|
|
@ -1,7 +0,0 @@
|
|||
import { Module } from '@nestjs/common';
|
||||
import { DbService } from './db.service';
|
||||
|
||||
@Module({
|
||||
providers: [DbService]
|
||||
})
|
||||
export class DbModule {}
|
|
@ -1,17 +0,0 @@
|
|||
import { Injectable } from '@nestjs/common';
|
||||
|
||||
@Injectable()
|
||||
export class DbService {
|
||||
constructor() {
|
||||
console.log(`DbService constructor`);
|
||||
}
|
||||
|
||||
insert(data: {
|
||||
cssSheetsLocation: string[];
|
||||
scriptsSheetsLocation: string[];
|
||||
}, collection: string) {
|
||||
|
||||
console.log({data, collection});
|
||||
|
||||
}
|
||||
}
|
6
src/interfaces/site.interface.ts
Normal file
6
src/interfaces/site.interface.ts
Normal file
|
@ -0,0 +1,6 @@
|
|||
/**
 * Shape of one crawl result.
 *
 * `CrawlerService.crawl` is typed to resolve with this interface, and
 * `ApiController` uses it as the document type of the injected `Site` model.
 */
export interface Site {
|
||||
/** Domain extracted from the crawled URL. */
domain: string;
|
||||
/** Values returned by the crawler's stylesheet download step for this site. */
cssSheetsLocation: string[];
|
||||
/** Script counterpart of `cssSheetsLocation` (presumably built the same way; the producing code is not visible here). */
scriptsSheetsLocation: string[];
|
||||
/** `href` values of the page's anchors, restricted to those starting with `http`. */
urlsList: string[];
|
||||
}
|
21
src/schema/sites.schema.ts
Normal file
21
src/schema/sites.schema.ts
Normal file
|
@ -0,0 +1,21 @@
|
|||
import { Prop, Schema, SchemaFactory } from '@nestjs/mongoose';
|
||||
import { Document } from 'mongoose';
|
||||
|
||||
export type SitesDocument = Sites & Document;
|
||||
|
||||
/**
 * Mongoose schema class for one crawled site.
 *
 * Mirrors the fields of the `Site` interface. `SchemaFactory.createForClass`
 * turns this class into the schema that is registered under the model name
 * `Site`.
 */
@Schema()
export class Sites {
  /** Domain of the crawled site; mandatory and declared unique. */
  @Prop({ required: true, unique: true })
  domain: string;

  // Element types of arrays cannot be reflected from TypeScript metadata, so
  // a bare `@Prop()` leaves the array elements untyped. Declaring
  // `[String]` explicitly makes Mongoose cast and store them as strings.

  /** Locations of the stylesheets collected for the site. */
  @Prop({ type: [String] })
  cssSheetsLocation: string[];

  /** Locations of the scripts collected for the site. */
  @Prop({ type: [String] })
  scriptsSheetsLocation: string[];

  /** Links found on the crawled page. */
  @Prop({ type: [String], required: true })
  urlsList: string[];
}
|
||||
|
||||
export const SitesSchema = SchemaFactory.createForClass(Sites);
|
Loading…
Reference in a new issue