Compare commits


33 commits

Author SHA1 Message Date
0a053ba249 fixing MONGO BUG 2023-04-20 19:49:46 +03:00
e04a9dd3b1 pushing for testing mongoDB 2023-04-20 18:52:55 +03:00
f24234d10e consistency 2023-04-19 15:14:47 +03:00
8b8e79d159 done 2023-04-19 15:03:33 +03:00
43b598689c work version , still bugs for auth in mongo, changing vesrion to 5.0.15 2023-04-19 14:54:30 +03:00
75c4f55c01 trying to fix problem 2023-04-19 11:10:15 +03:00
c57a13d8d8 adding API usage to README 2023-04-19 10:53:12 +03:00
15c718dd0a make sure there is an sites_assets 2023-04-19 10:15:19 +03:00
9044d96a55 make sure there is an sites_assets folder 2023-04-19 01:52:22 +03:00
275d22e045 installing mongoose + continuing to rest api 2023-04-19 01:47:05 +03:00
75dc340ec5 adding urlsList to the crawling 2023-04-18 22:00:28 +03:00
9d3e39683c added mongod module + created schema file 2023-04-18 21:31:33 +03:00
4b7d4d9022 convert to es6 2023-04-18 21:07:47 +03:00
acb563b3a0 collection from env file + import nest-mongo 2023-04-18 21:07:34 +03:00
e775a58400 fixing docker-compose 2023-04-18 20:57:18 +03:00
Kfir Dayan 245327de72 fixing docker issue 2023-04-18 17:35:49 +03:00
Kfir Dayan 52a338b8a2 install @nestjs/mongoose mongoose 2023-04-18 15:56:35 +03:00
Kfir Dayan b3b6cb403f small push 2023-04-18 15:54:49 +03:00
Kfir Dayan a5799e4b48 passing result to DB service 2023-04-18 13:29:39 +03:00
Kfir Dayan 661d5e9880 db add 2023-04-18 13:23:34 +03:00
Kfir Dayan 6d3ed58526 module + service is DB has added 2023-04-18 12:36:56 +03:00
Kfir Dayan eab79001ab fixing async/await/Promise issue with finalurls 2023-04-18 12:31:40 +03:00
Kfir Dayan fb02db9b2f save screenshot in full size page 2023-04-18 12:00:28 +03:00
Kfir Dayan 1376b14e2a pushing for testing 2023-04-18 11:53:24 +03:00
Kfir Dayan b8965e2e3e fixing bugs 2023-04-18 11:46:57 +03:00
Kfir Dayan 10af68eec7 useless commit 2023-04-18 11:21:47 +03:00
Kfir Dayan babed7889b reOrder the methods 2023-04-18 11:15:22 +03:00
Kfir Dayan e4d982891f downloading scriptsheets 2023-04-18 11:10:24 +03:00
Kfir Dayan 7385d5b6ab axios added + downloaded stylesheets 2023-04-18 10:56:09 +03:00
Kfir Dayan d81f21ee64 wip 2023-04-18 10:51:46 +03:00
Kfir Dayan daa1f363c8 README.md 2023-04-17 19:51:48 +03:00
Kfir Dayan c5d5a6ac72 README.md 2023-04-17 19:51:00 +03:00
Kfir Dayan 321a525e07 no need spec 2023-04-17 19:47:01 +03:00
16 changed files with 2598 additions and 735 deletions

1
.dockerignore Normal file

@@ -0,0 +1 @@
node_modules

6
.env.example Normal file

@@ -0,0 +1,6 @@
## DB [MongoDb](https://www.mongodb.com/) ##
MONGO_USERNAME=akamai
MONGO_PASSWORD=password
MONGO_DATABASE=mydatabase
MONGO_HOST=mongo
MONGO_PORT=27017

2
.gitignore vendored

@@ -4,7 +4,7 @@
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
#puppeteer's output
crawler_assets/**/*
sites_assets/**/*
# User-specific stuff:

27
Dockerfile Normal file

@@ -0,0 +1,27 @@
FROM node:14-alpine AS base
# Set non-interactive mode
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_ENV=production
# Install chromium
RUN apk add --no-cache chromium chromium-chromedriver
# Create a symbolic link for google-chrome
RUN ln -s /usr/bin/chromium-browser /usr/bin/google-chrome
# Set the working directory
WORKDIR /app
# Copy the package.json and package-lock.json files
COPY package*.json ./
# Install dependencies
RUN npm install -g npm@9.6.3 && \
npm install
# Copy the source code
COPY . .
# Start the application
CMD ["npm", "run", "start"]

README.md

@@ -1 +1,34 @@
# Crawing && Serving #
# Crawling & Serving #
The crawler is a simple crawler that crawls the web, stores the results in a database, and stores assets in the file system. The server is a simple server that serves the results of the crawler.
## Crawler ##
### Usage ###
POST a JSON object to `domain.com/crawl` in the following format:
{
  "url": "http://www.example.com"
}
The crawler will then crawl the given URL, store the results in a database, and store assets in the file system under
`sites_assets/www.example.com/`.
# API #
The API is a simple API that serves the results of the crawler.
# Routes #
## GET ##
/sites - Returns a list of all sites
/sites/:id - Returns the site object for the given site id
/sites/domain/:domain - Returns the site object for the given domain
## DELETE ##
/sites/:id - Deletes the site object for the given site id
/sites/domain/:domain - Deletes the site object for the given domain
## POST ##
/sites/:id - Updates the site object for the given site id
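
For quick manual testing, the flow above can be exercised end to end. A minimal TypeScript sketch, assuming the docker-compose stack is up and the API is reachable at localhost:3000 (per the port mapping in docker-compose.yaml):

```ts
// Hypothetical smoke test for the API above; not part of this branch.
const BASE_URL = 'http://localhost:3000';

async function crawlAndFetch(url: string): Promise<void> {
  // Trigger a crawl; the server acknowledges immediately and keeps
  // crawling in the background.
  const crawlRes = await fetch(`${BASE_URL}/crawl`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ url }),
  });
  console.log(await crawlRes.json()); // { message: 'Got your request for ...' }

  // Once the crawl has finished, read the stored result back by domain.
  const domain = new URL(url).hostname;
  const siteRes = await fetch(`${BASE_URL}/sites/domain/${domain}`);
  console.log(await siteRes.json());
}

crawlAndFetch('http://www.example.com').catch(console.error);
```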

32
docker-compose.yaml Normal file

@@ -0,0 +1,32 @@
version: '3'
services:
  web:
    build: .
    ports:
      - '3000:3000'
    depends_on:
      - mongo
    environment:
      - MONGO_USERNAME=${MONGO_USERNAME}
      - MONGO_PASSWORD=${MONGO_PASSWORD}
      - MONGO_HOST=${MONGO_HOST}
      - MONGO_PORT=${MONGO_PORT}
      - MONGO_DATABASE=${MONGO_DATABASE}
    networks:
      - appnet
  mongo:
    image: mongo
    environment:
      - MONGO_INITDB_DATABASE=${MONGO_DATABASE}
      - MONGO_INITDB_ROOT_USERNAME=${MONGO_USERNAME}
      - MONGO_INITDB_ROOT_PASSWORD=${MONGO_PASSWORD}
    volumes:
      - new:/data/db
    networks:
      - appnet
networks:
  appnet:
    driver: bridge
volumes:
  new:
    driver: local

2987
package-lock.json generated

File diff suppressed because it is too large

package.json

@@ -21,13 +21,18 @@
  "dependencies": {
    "@nestjs/common": "^9.0.0",
    "@nestjs/core": "^9.0.5",
    "@nestjs/mongoose": "^9.2.2",
    "@nestjs/platform-express": "^9.0.0",
    "axios": "^1.3.5",
    "dotenv": "^16.0.3",
    "minimatch": "^9.0.0",
    "mongoose": "^7.0.4",
    "puppeteer": "^19.9.1",
    "reflect-metadata": "^0.1.13",
    "@nestjs/cli": "^9.0.0",
    "rxjs": "^7.5.5"
  },
  "devDependencies": {
    "@nestjs/cli": "^9.0.0",
    "@nestjs/schematics": "^9.0.0",
    "@nestjs/testing": "^9.0.0",
    "@types/express": "^4.17.13",

18
src/api/api.controller.spec.ts Deleted file

@@ -1,18 +0,0 @@
import { Test, TestingModule } from '@nestjs/testing';
import { ApiController } from './api.controller';

describe('ApiController', () => {
  let controller: ApiController;

  beforeEach(async () => {
    const module: TestingModule = await Test.createTestingModule({
      controllers: [ApiController],
    }).compile();

    controller = module.get<ApiController>(ApiController);
  });

  it('should be defined', () => {
    expect(controller).toBeDefined();
  });
});

src/api/api.controller.ts

@@ -1,11 +1,68 @@
import { Body, Controller, Post } from '@nestjs/common';
import { Body, Controller, Delete, Get, Param, Post } from '@nestjs/common';
import { CrawlerService } from '../crawler/crawler.service';
import { InjectModel } from '@nestjs/mongoose';
import { Model } from 'mongoose';
import { Site } from 'src/interfaces/site.interface';

@Controller('/')
export class ApiController {
  constructor(private crawlerService: CrawlerService) {}
  constructor(private crawlerService: CrawlerService, @InjectModel('Site') private readonly siteModel: Model<Site>) {}

  @Post('crawl')
  async crawl(@Body() body: { url: string }) {
    return this.crawlerService.crawl(body.url);
    const results = this.crawlerService.crawl(body.url);
    results.then((data) => {
      console.log("Done crawling !", data);
      const newSite = new this.siteModel(data);
      newSite.save().then((result) => {
        console.log("Site saved !", result);
      }).catch((err) => {
        console.log("Error saving site !", err.message);
      });
    }).catch((err) => {
      console.log("** Error crawling ! **", err);
      console.log(err);
    });
    return {
      message: 'Got your request for ' + body.url
    }
  }

  // Get all
  @Get('sites')
  async getSites() {
    const sites = await this.siteModel.find().exec();
    return sites || {};
  }

  // Get by id
  @Get('sites/:id')
  async getSite(@Param('id') id: string) {
    const site = await this.siteModel.findById(id).exec();
    return site || {};
  }

  // Get by domain
  @Get('sites/domain/:domain')
  async getSiteByDomain(@Param('domain') domain: string) {
    const site = await this.siteModel.findOne({ domain }).exec();
    return site || {};
  }

  // Delete by domain
  @Delete('sites/domain/:domain')
  async deleteSiteByDomain(@Param('domain') domain: string) {
    const site = await this.siteModel.findOneAndDelete({ domain }).exec();
    return site || {};
  }

  // Delete by id
  @Delete('sites/:id')
  async deleteSite(@Param('id') id: string) {
    const site = await this.siteModel.findByIdAndDelete(id).exec();
    return site || {};
  }
}
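
One caveat in this controller: `crawl` responds before the background save finishes, and because `domain` is declared unique in sites.schema.ts, crawling the same domain twice makes `save()` reject with a duplicate-key error (E11000). A hedged sketch of one way around that, swapping `save()` for an upsert — an illustration, not code from this branch:

```ts
// Sketch only: same controller context as above (siteModel injected via
// @InjectModel('Site')). findOneAndUpdate with upsert replaces a previous
// crawl of the same domain instead of rejecting with E11000.
@Post('crawl')
async crawl(@Body() body: { url: string }) {
  this.crawlerService.crawl(body.url)
    .then((data) =>
      this.siteModel.findOneAndUpdate(
        { domain: data.domain },    // match on the unique key
        data,                       // overwrite with the fresh result
        { upsert: true, new: true } // insert if missing, return the new doc
      ).exec()
    )
    .then((site) => console.log('Site saved!', site?.domain))
    .catch((err) => console.log('Crawl or save failed:', err.message));

  return { message: 'Got your request for ' + body.url };
}
```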

src/api/api.module.ts

@@ -1,8 +1,11 @@
import { Module } from '@nestjs/common';
import { ApiController } from './api.controller';
import { CrawlerService } from '../crawler/crawler.service';
import { SitesSchema } from '../schema/sites.schema';
import { MongooseModule } from '@nestjs/mongoose';

@Module({
  imports: [MongooseModule.forFeature([{ name: 'Site', schema: SitesSchema }])],
  controllers: [ApiController],
  providers: [CrawlerService]

src/app.module.ts

@@ -1,8 +1,20 @@
import { Module } from '@nestjs/common';
import { MongooseModule } from '@nestjs/mongoose';
import { ApiModule } from './api/api.module';
import { CrawlerModule } from './crawler/crawler.module';
const dotenv = require('dotenv');
dotenv.config();
console.log("mongodb://" + process.env.MONGO_USERNAME + ":" + process.env.MONGO_PASSWORD + "@" + process.env.MONGO_HOST + ":" + process.env.MONGO_PORT);

@Module({
  imports: [ApiModule, CrawlerModule]
  imports: [
    MongooseModule.forRoot("mongodb://" + process.env.MONGO_USERNAME + ":" + process.env.MONGO_PASSWORD + "@" + process.env.MONGO_HOST + ":" + process.env.MONGO_PORT, {
      useNewUrlParser: true,
      useUnifiedTopology: true,
    }),
    ApiModule,
    CrawlerModule
  ]
})
export class AppModule { }
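
A few things worth flagging in this module: the `console.log` prints the connection string with credentials, `useNewUrlParser`/`useUnifiedTopology` have had no effect since Mongoose 6, and the URI never appends `MONGO_DATABASE`. A hedged sketch of a stricter URI builder — the helper name and the `authSource=admin` choice are assumptions, the latter based on the mongo image creating its root user in the `admin` database:

```ts
// Hypothetical helper: fail fast on missing env vars and keep credentials
// out of the logs. authSource=admin assumes the root user created by
// MONGO_INITDB_ROOT_USERNAME in docker-compose.yaml.
function buildMongoUri(): string {
  const required = ['MONGO_USERNAME', 'MONGO_PASSWORD', 'MONGO_HOST', 'MONGO_PORT'] as const;
  for (const key of required) {
    if (!process.env[key]) throw new Error(`Missing required env var: ${key}`);
  }
  const { MONGO_USERNAME, MONGO_PASSWORD, MONGO_HOST, MONGO_PORT, MONGO_DATABASE } = process.env;
  return `mongodb://${MONGO_USERNAME}:${encodeURIComponent(MONGO_PASSWORD!)}` +
    `@${MONGO_HOST}:${MONGO_PORT}/${MONGO_DATABASE ?? ''}?authSource=admin`;
}

// Usage in the module above: MongooseModule.forRoot(buildMongoUri())
```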

18
src/crawler/crawler.service.spec.ts Deleted file

@@ -1,18 +0,0 @@
import { Test, TestingModule } from '@nestjs/testing';
import { CrawlerService } from './crawler.service';

describe('CrawlerService', () => {
  let service: CrawlerService;

  beforeEach(async () => {
    const module: TestingModule = await Test.createTestingModule({
      providers: [CrawlerService],
    }).compile();

    service = module.get<CrawlerService>(CrawlerService);
  });

  it('should be defined', () => {
    expect(service).toBeDefined();
  });
});

src/crawler/crawler.service.ts

@@ -3,25 +3,102 @@ import { mkdirSync } from 'fs';
import * as fs from 'fs';
import puppeteer from 'puppeteer';
import { URL } from 'url';
import axios from 'axios';
import { Site } from '../interfaces/site.interface';

@Injectable()
export class CrawlerService {
  async crawl(url: string){
    const browser = await puppeteer.launch();
  constructor() {
    if (!fs.existsSync('sites_assets')) {
      mkdirSync('sites_assets');
    }
  }

  async crawl(url: string): Promise<Site> {
    console.log("start crawl website", url);
    const browser = await puppeteer.launch({ executablePath: '/usr/bin/chromium-browser', headless: true, args: ['--no-sandbox'] });
    const page = await browser.newPage();
    const domain = this.extractDomain(url);
    await page.goto(url);
    const directory = `crawler_assets/${this.extractDomain(url)}/`;
    console.log(directory)
    const directory = `sites_assets/${domain}/`;
    if (!fs.existsSync(directory)) {
      mkdirSync(directory);
    }
    await page.pdf({path: `${directory}/page.pdf`, format: 'A4'});
    await page.screenshot({path: `${directory}/screenshot.png`});

    // STYLESHEETS //
    console.log("start stylesheets")
    const stylesheetsUrls = await page.$$eval('link[rel="stylesheet"]', links => links.map(link => link.href));
    let cssDir = `${directory}/css/`
    const cssSheetsLocation = await this.downloadFiles(stylesheetsUrls, cssDir);
    console.log(`cssSheetsLocation: `, cssSheetsLocation);
    // STYLESHEETS //

    // SCRIPTS //
    const scriptsUrls = await page.$$eval('script', scripts => scripts.map(script => script.src));
    let scriptsDir = `${directory}/scripts/`
    const scriptsSheetsLocation = await this.downloadFiles(scriptsUrls, scriptsDir);
    console.log(`scriptsSheetsLocation: `, scriptsSheetsLocation)
    // SCRIPTS //

    // SCREENSHOT //
    const screenshotBuffer: Buffer = await page.screenshot({ fullPage: true });
    await new Promise((resolve, reject) => {
      fs.writeFile(`${directory}screenshot.png`, screenshotBuffer, (err) => {
        if (err) {
          reject(err);
        } else {
          resolve(true);
        }
      });
    });
    // SCREENSHOT //

    // URLS //
    const urls = await page.$$eval('a', links => links.map(link => link.href));
    const urlsList = urls.filter((url) => url.startsWith('http'));
    console.log(urlsList);
    // URLS //

    await browser.close();
    return {
      domain,
      cssSheetsLocation,
      scriptsSheetsLocation,
      urlsList
    }
  }

  extractDomain(urlString) {
  async downloadFiles(urls: string[], path: string) {
    const finalUrls: string[] = [];
    if (!fs.existsSync(path)) {
      mkdirSync(path);
    }
    await Promise.all(
      urls.map(async (url) => {
        if (!url.startsWith('http')) return;
        const response = await axios.get(url);
        const content = response.data;
        // trim / from end of url string
        let fileLocation = url.replace(/\/$/, "");
        // get last part of url
        fileLocation = fileLocation.substring(fileLocation.lastIndexOf('/') + 1);
        // save file
        if (fileLocation.length > 10) {
          fileLocation = fileLocation.substring(0, 10);
        }
        finalUrls.push(`${path}${fileLocation}`);
        console.log(`Saving file ${path}${fileLocation}`);
        fs.writeFileSync(`${path}${fileLocation}`, content);
      })
    );
    return finalUrls;
  }

  extractDomain(urlString: string) {
    const url = new URL(urlString);
    return url.hostname;
  }
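
One fragility in `downloadFiles`: `Promise.all` rejects on the first failed request, which aborts the whole crawl, and truncating filenames to 10 characters drops extensions and invites collisions. A hedged sketch of a more defensive variant, assuming the same `fs` and `axios` imports; the hashing scheme is an illustration, not this branch's behaviour:

```ts
import { createHash } from 'crypto';

// Sketch only: tolerate individual download failures and derive stable,
// collision-resistant filenames from the URL instead of a 10-char prefix.
async function downloadFilesSafe(urls: string[], dir: string): Promise<string[]> {
  if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });

  const results = await Promise.allSettled(
    urls.filter((u) => u.startsWith('http')).map(async (url) => {
      const response = await axios.get<string>(url, { responseType: 'text' });
      const name = createHash('sha1').update(url).digest('hex').slice(0, 12);
      const file = `${dir}${name}`;
      fs.writeFileSync(file, response.data);
      return file;
    })
  );

  // Keep only the downloads that succeeded; failures are simply skipped.
  return results
    .filter((r): r is PromiseFulfilledResult<string> => r.status === 'fulfilled')
    .map((r) => r.value);
}
```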

6
src/interfaces/site.interface.ts Normal file

@@ -0,0 +1,6 @@
export interface Site {
  domain: string;
  cssSheetsLocation: string[];
  scriptsSheetsLocation: string[];
  urlsList: string[];
}

21
src/schema/sites.schema.ts Normal file

@@ -0,0 +1,21 @@
import { Prop, Schema, SchemaFactory } from '@nestjs/mongoose';
import { Document } from 'mongoose';

export type SitesDocument = Sites & Document;

@Schema()
export class Sites {
  @Prop({ required: true, unique: true })
  domain: string;

  @Prop()
  cssSheetsLocation: string[];

  @Prop()
  scriptsSheetsLocation: string[];

  @Prop({ required: true })
  urlsList: string[];
}

export const SitesSchema = SchemaFactory.createForClass(Sites);
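
For reference, a hypothetical document as this schema would store it after crawling http://www.example.com. The class is named `Sites`, but api.module.ts registers it under the model token 'Site'. Paths and URLs below are illustrative, not output from a real run; note the doubled slash, which comes from `${directory}/css/` where `directory` already ends in `/`:

```ts
// Illustrative document shape only — values are made up.
const exampleSite = {
  domain: 'www.example.com',
  cssSheetsLocation: ['sites_assets/www.example.com//css/styles.css'],
  scriptsSheetsLocation: ['sites_assets/www.example.com//scripts/main.js'],
  urlsList: ['https://www.iana.org/domains/example'],
};
```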