Compare commits
33 commits
implement_
...
master
Author | SHA1 | Date | |
---|---|---|---|
0a053ba249 | |||
e04a9dd3b1 | |||
f24234d10e | |||
8b8e79d159 | |||
43b598689c | |||
75c4f55c01 | |||
c57a13d8d8 | |||
15c718dd0a | |||
9044d96a55 | |||
275d22e045 | |||
75dc340ec5 | |||
9d3e39683c | |||
4b7d4d9022 | |||
acb563b3a0 | |||
e775a58400 | |||
|
245327de72 | ||
|
52a338b8a2 | ||
|
b3b6cb403f | ||
|
a5799e4b48 | ||
|
661d5e9880 | ||
|
6d3ed58526 | ||
|
eab79001ab | ||
|
fb02db9b2f | ||
|
1376b14e2a | ||
|
b8965e2e3e | ||
|
10af68eec7 | ||
|
babed7889b | ||
|
e4d982891f | ||
|
7385d5b6ab | ||
|
d81f21ee64 | ||
|
daa1f363c8 | ||
|
c5d5a6ac72 | ||
|
321a525e07 |
16 changed files with 2598 additions and 735 deletions
1
.dockerignore
Normal file
1
.dockerignore
Normal file
|
@ -0,0 +1 @@
|
||||||
|
node_modules
|
6
.env.example
Normal file
6
.env.example
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
## DB [MongoDb](https://www.mongodb.com/) ##
|
||||||
|
MONGO_USERNAME=akamai
|
||||||
|
MONGO_PASSWORD=password
|
||||||
|
MONGO_DATABASE=mydatabase
|
||||||
|
MONGO_HOST=mongo
|
||||||
|
MONGO_PORT=27017
|
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -4,7 +4,7 @@
|
||||||
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
||||||
|
|
||||||
#poppeteer's output
|
# puppeteer's output
|
||||||
crawler_assets/**/*
|
sites_assets/**/*
|
||||||
|
|
||||||
|
|
||||||
# User-specific stuff:
|
# User-specific stuff:
|
||||||
|
|
27
Dockerfile
Normal file
27
Dockerfile
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
FROM node:14-alpine AS base
|
||||||
|
|
||||||
|
# Set non-interactive mode
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
ENV NODE_ENV=production
|
||||||
|
|
||||||
|
# Install chromium
|
||||||
|
RUN apk add --no-cache chromium chromium-chromedriver
|
||||||
|
|
||||||
|
# Create a symbolic link for google-chrome
|
||||||
|
RUN ln -s /usr/bin/chromium-browser /usr/bin/google-chrome
|
||||||
|
|
||||||
|
# Set the working directory
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Copy the package.json and package-lock.json files
|
||||||
|
COPY package*.json ./
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
RUN npm install -g npm@9.6.3 && \
|
||||||
|
npm install
|
||||||
|
|
||||||
|
# Copy the source code
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# Start the application
|
||||||
|
CMD ["npm", "run", "start"]
|
35
README.md
35
README.md
|
@ -1 +1,34 @@
|
||||||
# Crawing && Serving #
|
# Crawling & Serving #
|
||||||
|
|
||||||
|
The crawler visits a given URL, stores the crawl results in a database, and saves the page's assets to the file system. The server exposes the crawler's results through an HTTP API.
|
||||||
|
|
||||||
|
|
||||||
|
## Crawler ##
|
||||||
|
|
||||||
|
|
||||||
|
### Usage ###
|
||||||
|
Post a JSON object to the crawler with the following format:
|
||||||
|
|
||||||
|
`domain.com/crawl`
|
||||||
|
{
|
||||||
|
"url": "http://www.example.com"
|
||||||
|
}
|
||||||
|
|
||||||
|
The crawler will then crawl the given url and store the results in a database and assets in a file system
|
||||||
|
`sites_assets/www.example.com/`.
|
||||||
|
|
||||||
|
|
||||||
|
# API #
|
||||||
|
|
||||||
|
The API is a simple API that serves the results of the crawler.
|
||||||
|
|
||||||
|
# Routes #
|
||||||
|
## GET ##
|
||||||
|
/sites - Returns a list of all sites
|
||||||
|
/sites/:id - Returns the site object for the given site Id
|
||||||
|
/sites/domain/:domain - Returns the domain object for the given domain
|
||||||
|
## DELETE ##
|
||||||
|
/sites/:id - Deletes the site object for the given site Id
|
||||||
|
/sites/domain/:domain - Deletes the domain object for the given domain
|
||||||
|
## POST ##
|
||||||
|
/sites/:id - Updates the site object for the given site Id
|
32
docker-compose.yaml
Normal file
32
docker-compose.yaml
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
version: '3'
|
||||||
|
services:
|
||||||
|
web:
|
||||||
|
build: .
|
||||||
|
ports:
|
||||||
|
- '3000:3000'
|
||||||
|
depends_on:
|
||||||
|
- mongo
|
||||||
|
environment:
|
||||||
|
- MONGO_USERNAME=${MONGO_USERNAME}
|
||||||
|
- MONGO_PASSWORD=${MONGO_PASSWORD}
|
||||||
|
- MONGO_HOST=${MONGO_HOST}
|
||||||
|
- MONGO_PORT=${MONGO_PORT}
|
||||||
|
- MONGO_DATABASE=${MONGO_DATABASE}
|
||||||
|
networks:
|
||||||
|
- appnet
|
||||||
|
mongo:
|
||||||
|
image: mongo
|
||||||
|
environment:
|
||||||
|
- MONGO_INITDB_DATABASE=${MONGO_DATABASE}
|
||||||
|
- MONGO_INITDB_ROOT_USERNAME=${MONGO_USERNAME}
|
||||||
|
- MONGO_INITDB_ROOT_PASSWORD=${MONGO_PASSWORD}
|
||||||
|
volumes:
|
||||||
|
- new:/data/db
|
||||||
|
networks:
|
||||||
|
- appnet
|
||||||
|
networks:
|
||||||
|
appnet:
|
||||||
|
driver: bridge
|
||||||
|
volumes:
|
||||||
|
new:
|
||||||
|
driver: local
|
2987
package-lock.json
generated
2987
package-lock.json
generated
File diff suppressed because it is too large
Load diff
|
@ -21,13 +21,18 @@
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@nestjs/common": "^9.0.0",
|
"@nestjs/common": "^9.0.0",
|
||||||
"@nestjs/core": "^9.0.5",
|
"@nestjs/core": "^9.0.5",
|
||||||
|
"@nestjs/mongoose": "^9.2.2",
|
||||||
"@nestjs/platform-express": "^9.0.0",
|
"@nestjs/platform-express": "^9.0.0",
|
||||||
|
"axios": "^1.3.5",
|
||||||
|
"dotenv": "^16.0.3",
|
||||||
|
"minimatch": "^9.0.0",
|
||||||
|
"mongoose": "^7.0.4",
|
||||||
"puppeteer": "^19.9.1",
|
"puppeteer": "^19.9.1",
|
||||||
"reflect-metadata": "^0.1.13",
|
"reflect-metadata": "^0.1.13",
|
||||||
|
"@nestjs/cli": "^9.0.0",
|
||||||
"rxjs": "^7.5.5"
|
"rxjs": "^7.5.5"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@nestjs/cli": "^9.0.0",
|
|
||||||
"@nestjs/schematics": "^9.0.0",
|
"@nestjs/schematics": "^9.0.0",
|
||||||
"@nestjs/testing": "^9.0.0",
|
"@nestjs/testing": "^9.0.0",
|
||||||
"@types/express": "^4.17.13",
|
"@types/express": "^4.17.13",
|
||||||
|
|
|
@ -1,18 +0,0 @@
|
||||||
import { Test, TestingModule } from '@nestjs/testing';
|
|
||||||
import { ApiController } from './api.controller';
|
|
||||||
|
|
||||||
describe('ApiController', () => {
|
|
||||||
let controller: ApiController;
|
|
||||||
|
|
||||||
beforeEach(async () => {
|
|
||||||
const module: TestingModule = await Test.createTestingModule({
|
|
||||||
controllers: [ApiController],
|
|
||||||
}).compile();
|
|
||||||
|
|
||||||
controller = module.get<ApiController>(ApiController);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should be defined', () => {
|
|
||||||
expect(controller).toBeDefined();
|
|
||||||
});
|
|
||||||
});
|
|
|
@ -1,11 +1,68 @@
|
||||||
import { Body, Controller, Post } from '@nestjs/common';
|
import { Body, Controller, Delete, Get, Param, Post } from '@nestjs/common';
|
||||||
import { CrawlerService } from '../crawler/crawler.service';
|
import { CrawlerService } from '../crawler/crawler.service';
|
||||||
|
import { InjectModel } from '@nestjs/mongoose';
|
||||||
|
import { Model } from 'mongoose';
|
||||||
|
import { Site } from 'src/interfaces/site.interface';
|
||||||
|
|
||||||
@Controller('/')
|
@Controller('/')
|
||||||
export class ApiController {
|
export class ApiController {
|
||||||
constructor(private crawlerService: CrawlerService) {}
|
|
||||||
|
constructor(private crawlerService: CrawlerService, @InjectModel('Site') private readonly siteModel: Model<Site> ) {}
|
||||||
|
|
||||||
@Post('crawl')
|
@Post('crawl')
|
||||||
async crawl(@Body() body: { url: string }) {
|
async crawl(@Body() body: { url: string }) {
|
||||||
return this.crawlerService.crawl(body.url);
|
|
||||||
|
const results = this.crawlerService.crawl(body.url);
|
||||||
|
results.then((data) => {
|
||||||
|
console.log("Done crawling !", data);
|
||||||
|
const newSite = new this.siteModel(data);
|
||||||
|
newSite.save().then((result) => {
|
||||||
|
console.log("Site saved !", result);
|
||||||
|
}).catch((err) => {
|
||||||
|
console.log("Error saving site !", err.message);
|
||||||
|
});
|
||||||
|
}).catch((err) => {
|
||||||
|
console.log("** Error crawling ! **", err);
|
||||||
|
console.log(err);
|
||||||
|
});
|
||||||
|
return {
|
||||||
|
message: 'Got your request for ' + body.url
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Get all
|
||||||
|
@Get('sites')
|
||||||
|
async getSites() {
|
||||||
|
const sites = await this.siteModel.find().exec();
|
||||||
|
return sites || {};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get by id
|
||||||
|
@Get('sites/:id')
|
||||||
|
async getSite(@Param('id') id: string) {
|
||||||
|
const site = await this.siteModel.findById(id).exec();
|
||||||
|
return site || {};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get by domain
|
||||||
|
@Get('sites/domain/:domain')
|
||||||
|
async getSiteByDomain(@Param('domain') domain: string) {
|
||||||
|
const site = await this.siteModel.findOne({ domain }).exec();
|
||||||
|
return site || {};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete by domain
|
||||||
|
@Delete('sites/domain/:domain')
|
||||||
|
async deleteSiteByDomain(@Param('domain') domain: string) {
|
||||||
|
const site = await this.siteModel.findOneAndDelete({ domain }).exec();
|
||||||
|
return site || {};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete by id
|
||||||
|
@Delete('sites/:id')
|
||||||
|
async deleteSite(@Param('id') id: string) {
|
||||||
|
const site = await this.siteModel.findByIdAndDelete(id).exec();
|
||||||
|
return site || {};
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
|
@ -1,8 +1,11 @@
|
||||||
import { Module } from '@nestjs/common';
|
import { Module } from '@nestjs/common';
|
||||||
import { ApiController } from './api.controller';
|
import { ApiController } from './api.controller';
|
||||||
import { CrawlerService } from '../crawler/crawler.service';
|
import { CrawlerService } from '../crawler/crawler.service';
|
||||||
|
import { SitesSchema } from '../schema/sites.schema';
|
||||||
|
import { MongooseModule } from '@nestjs/mongoose';
|
||||||
|
|
||||||
@Module({
|
@Module({
|
||||||
|
imports: [MongooseModule.forFeature([{ name: 'Site', schema: SitesSchema }])],
|
||||||
controllers: [ApiController],
|
controllers: [ApiController],
|
||||||
providers: [CrawlerService]
|
providers: [CrawlerService]
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,20 @@
|
||||||
import { Module } from '@nestjs/common';
|
import { Module } from '@nestjs/common';
|
||||||
|
import { MongooseModule } from '@nestjs/mongoose';
|
||||||
import { ApiModule } from './api/api.module';
|
import { ApiModule } from './api/api.module';
|
||||||
import { CrawlerModule } from './crawler/crawler.module';
|
import { CrawlerModule } from './crawler/crawler.module';
|
||||||
|
const dotenv = require('dotenv');
|
||||||
|
|
||||||
|
dotenv.config();
|
||||||
|
// Log only the MongoDB host and port at startup. The username and password
// are deliberately left out so credentials never end up in application logs.
console.log("mongodb://" + process.env.MONGO_HOST + ":" + process.env.MONGO_PORT);
|
||||||
@Module({
|
@Module({
|
||||||
imports: [ApiModule, CrawlerModule]
|
imports: [
|
||||||
|
MongooseModule.forRoot("mongodb://" + process.env.MONGO_USERNAME + ":" + process.env.MONGO_PASSWORD + "@" + process.env.MONGO_HOST + ":" + process.env.MONGO_PORT, {
|
||||||
|
useNewUrlParser: true,
|
||||||
|
useUnifiedTopology: true,
|
||||||
|
}),
|
||||||
|
ApiModule,
|
||||||
|
CrawlerModule
|
||||||
|
]
|
||||||
|
|
||||||
})
|
})
|
||||||
export class AppModule { }
|
export class AppModule { }
|
||||||
|
|
|
@ -1,18 +0,0 @@
|
||||||
import { Test, TestingModule } from '@nestjs/testing';
|
|
||||||
import { CrawlerService } from './crawler.service';
|
|
||||||
|
|
||||||
describe('CrawlerService', () => {
|
|
||||||
let service: CrawlerService;
|
|
||||||
|
|
||||||
beforeEach(async () => {
|
|
||||||
const module: TestingModule = await Test.createTestingModule({
|
|
||||||
providers: [CrawlerService],
|
|
||||||
}).compile();
|
|
||||||
|
|
||||||
service = module.get<CrawlerService>(CrawlerService);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should be defined', () => {
|
|
||||||
expect(service).toBeDefined();
|
|
||||||
});
|
|
||||||
});
|
|
|
@ -3,25 +3,102 @@ import { mkdirSync } from 'fs';
|
||||||
import * as fs from 'fs';
|
import * as fs from 'fs';
|
||||||
import puppeteer from 'puppeteer';
|
import puppeteer from 'puppeteer';
|
||||||
import { URL } from 'url';
|
import { URL } from 'url';
|
||||||
|
import axios from 'axios';
|
||||||
|
import { Site } from '../interfaces/site.interface';
|
||||||
|
|
||||||
@Injectable()
|
@Injectable()
|
||||||
export class CrawlerService {
|
export class CrawlerService {
|
||||||
|
|
||||||
async crawl(url: string){
|
constructor() {
|
||||||
const browser = await puppeteer.launch();
|
if (!fs.existsSync('sites_assets')) {
|
||||||
|
mkdirSync('sites_assets');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async crawl(url: string): Promise<Site> {
|
||||||
|
|
||||||
|
console.log("start crawl website", url);
|
||||||
|
const browser = await puppeteer.launch({ executablePath: '/usr/bin/chromium-browser', headless: true, args: ['--no-sandbox'] });
|
||||||
const page = await browser.newPage();
|
const page = await browser.newPage();
|
||||||
|
const domain = this.extractDomain(url);
|
||||||
await page.goto(url);
|
await page.goto(url);
|
||||||
const directory = `crawler_assets/${this.extractDomain(url)}/`;
|
const directory = `sites_assets/${domain}/`;
|
||||||
console.log(directory)
|
|
||||||
if (!fs.existsSync(directory)) {
|
if (!fs.existsSync(directory)) {
|
||||||
mkdirSync(directory);
|
mkdirSync(directory);
|
||||||
}
|
}
|
||||||
await page.pdf({path: `${directory}/page.pdf`, format: 'A4'});
|
|
||||||
await page.screenshot({path: `${directory}/screenshot.png`});
|
// STYLESHEETS //
|
||||||
|
console.log("start stylesheets")
|
||||||
|
const stylesheetsUrls = await page.$$eval('link[rel="stylesheet"]', links => links.map(link => link.href));
|
||||||
|
let cssDir = `${directory}/css/`
|
||||||
|
const cssSheetsLocation = await this.downloadFiles(stylesheetsUrls, cssDir);
|
||||||
|
console.log(`cssSheetsLocation: `, cssSheetsLocation);
|
||||||
|
// STYLESHEETS //
|
||||||
|
|
||||||
|
// SCRIPTS //
|
||||||
|
const scriptsUrls = await page.$$eval('script', scripts => scripts.map(script => script.src));
|
||||||
|
let scriptsDir = `${directory}/scripts/`
|
||||||
|
const scriptsSheetsLocation = await this.downloadFiles(scriptsUrls, scriptsDir);
|
||||||
|
console.log(`scriptsSheetsLocation: `, scriptsSheetsLocation)
|
||||||
|
// SCRIPTS //
|
||||||
|
|
||||||
|
// SCREENSHOT //
|
||||||
|
const screenshotBuffer: Buffer = await page.screenshot({ fullPage: true });
|
||||||
|
await new Promise((resolve, reject) => {
|
||||||
|
fs.writeFile(`${directory}screenshot.png`, screenshotBuffer, (err) => {
|
||||||
|
if (err) {
|
||||||
|
reject(err);
|
||||||
|
} else {
|
||||||
|
resolve(true);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
// SCREENSHOT //
|
||||||
|
|
||||||
|
// URLS //
|
||||||
|
const urls = await page.$$eval('a', links => links.map(link => link.href));
|
||||||
|
const urlsList = urls.filter((url) => url.startsWith('http'));
|
||||||
|
console.log(urlsList);
|
||||||
|
// URLS //
|
||||||
|
|
||||||
await browser.close();
|
await browser.close();
|
||||||
|
return {
|
||||||
|
domain,
|
||||||
|
cssSheetsLocation,
|
||||||
|
scriptsSheetsLocation,
|
||||||
|
urlsList
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extractDomain(urlString) {
|
async downloadFiles(urls: string[], path: string) {
|
||||||
|
const finalUrls: string[] = [];
|
||||||
|
if (!fs.existsSync(path)) {
|
||||||
|
mkdirSync(path);
|
||||||
|
}
|
||||||
|
|
||||||
|
await Promise.all(
|
||||||
|
urls.map(async (url) => {
|
||||||
|
if (!url.startsWith('http')) return;
|
||||||
|
const response = await axios.get(url);
|
||||||
|
const content = response.data;
|
||||||
|
// trim / from end of url string
|
||||||
|
let fileLocation = url.replace(/\/$/, "");
|
||||||
|
// get last part of url
|
||||||
|
fileLocation = fileLocation.substring(fileLocation.lastIndexOf('/') + 1);
|
||||||
|
// save file
|
||||||
|
if (fileLocation.length > 10) {
|
||||||
|
fileLocation = fileLocation.substring(0, 10);
|
||||||
|
}
|
||||||
|
finalUrls.push(`${path}${fileLocation}`);
|
||||||
|
console.log(`Saving file ${path}${fileLocation}`);
|
||||||
|
fs.writeFileSync(`${path}${fileLocation}`, content);
|
||||||
|
})
|
||||||
|
);
|
||||||
|
|
||||||
|
return finalUrls;
|
||||||
|
}
|
||||||
|
|
||||||
|
extractDomain(urlString: string) {
|
||||||
const url = new URL(urlString);
|
const url = new URL(urlString);
|
||||||
return url.hostname;
|
return url.hostname;
|
||||||
}
|
}
|
||||||
|
|
6
src/interfaces/site.interface.ts
Normal file
6
src/interfaces/site.interface.ts
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
export interface Site {
|
||||||
|
domain: string;
|
||||||
|
cssSheetsLocation: string[];
|
||||||
|
scriptsSheetsLocation: string[];
|
||||||
|
urlsList: string[];
|
||||||
|
}
|
21
src/schema/sites.schema.ts
Normal file
21
src/schema/sites.schema.ts
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
import { Prop, Schema, SchemaFactory } from '@nestjs/mongoose';
|
||||||
|
import { Document } from 'mongoose';
|
||||||
|
|
||||||
|
export type SitesDocument = Sites & Document;
|
||||||
|
|
||||||
|
@Schema()
|
||||||
|
export class Sites {
|
||||||
|
@Prop({ required: true, unique: true})
|
||||||
|
domain: string;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
cssSheetsLocation: string[];
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
scriptsSheetsLocation: string[];
|
||||||
|
|
||||||
|
@Prop({ required: true})
|
||||||
|
urlsList: string[];
|
||||||
|
}
|
||||||
|
|
||||||
|
export const SitesSchema = SchemaFactory.createForClass(Sites);
|
Loading…
Reference in a new issue