README.md added

Kfir Dayan 2023-05-31 13:17:19 +03:00
parent 6a7eb59295
commit 9f23246458
3 changed files with 70 additions and 38 deletions

README.md (the old three-line quick-start commands were replaced with the documentation below):

# Web Crawler API

The Web Crawler API is a simple API that allows you to crawl websites and store the crawled data in a database. It uses GuzzleHttp to send HTTP requests and parses the HTML content to extract links from web pages. The API is built with the Laravel framework.

## Features
- Crawls websites and stores the crawled data in the database.
- Supports setting the depth of the crawling process.
- Prevents duplicate URLs from being crawled.
- Retrieves and saves the HTML content of crawled pages.
- Extracts valid URLs from the crawled pages.
## Prerequisites
- PHP >= 7.4
- Composer
- Laravel framework
- MongoDB
- Docker
- Docker Compose
- GuzzleHttp
- MongoDB PHP driver (the `mongodb.so` extension)
- The `jenssegers/mongodb` package
## Getting Started
1. Clone the repository:
```bash
git clone <repository-url>
```
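The commit's Getting Started section stops after the clone step. The remaining setup is not spelled out in the diff; a plausible sequence, assuming a standard Laravel project with the prerequisites listed above (all commands below are assumptions, not part of the commit):

```bash
# Install PHP dependencies (assumes composer.json already requires
# guzzlehttp/guzzle and jenssegers/mongodb, as listed under Prerequisites)
composer install

# Install the MongoDB PHP extension if it is not already loaded
pecl install mongodb

# Create the environment file and application key (standard Laravel steps)
cp .env.example .env
php artisan key:generate

# Start MongoDB, run the migrations, and serve the API
# (these three commands come from the Services section below)
docker-compose up -d
php artisan migrate
php artisan serve
```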
## Services
### Server
Run the server: `php artisan serve`
### MongoDB
Start MongoDB: `docker-compose up -d`
Run the migrations: `php artisan migrate`
## Configuration
Use the `.env` file to set up the database connection.
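A minimal `.env` sketch for the MongoDB connection, assuming the usual `jenssegers/mongodb` connection name and the Docker Compose defaults; host, port, and database name are assumptions and should match `docker-compose.yml` and `config/database.php`:

```ini
DB_CONNECTION=mongodb
DB_HOST=127.0.0.1
DB_PORT=27017
DB_DATABASE=web_crawler
DB_USERNAME=
DB_PASSWORD=
```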
## API Endpoints
- `GET /api/crawl`: Crawls a website and stores the crawled data in the database. Required query parameter: `url`. Optional query parameter: `depth` (default: 1).
- `GET /api`: Retrieves all crawled data from the database.
- `DELETE /api/crawl/{id}`: Deletes a specific crawled data record from the database.
- `DELETE /api/crawl`: Deletes all crawled data records from the database.
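Example requests against a local `php artisan serve` instance; the host, port, target URL, and record id below are placeholders:

```bash
# Crawl a site two levels deep
curl "http://127.0.0.1:8000/api/crawl?url=https://example.com&depth=2"

# List everything that has been crawled
curl "http://127.0.0.1:8000/api"

# Delete a single crawled record by id, then delete them all
curl -X DELETE "http://127.0.0.1:8000/api/crawl/<record-id>"
curl -X DELETE "http://127.0.0.1:8000/api/crawl"
```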

Changes to `WebCrawlController` (presumably `app/Http/Controllers/WebCrawlController.php`), shown as a unified diff:

```diff
@@ -20,10 +20,11 @@ public function crawlWebsite(Request $request)
     {
         $url = $request->query('url');
         $depth = $request->query('depth', 1);
+        $refresh = $request->query('refresh', false);
 
         // Check if the URL is already in the database
         $webCrawl = WebCrawl::where('url', $url)->first();
-        if ($webCrawl) {
+        if ($webCrawl && !$refresh) {
             return response()->json([
                 'error' => 'This URL already exists in the database',
             ], 400);
```
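The new `refresh` flag relaxes the duplicate check above: when it is set, an already-stored URL is crawled again instead of being rejected with the 400 error, and the next hunk reuses the existing document via `$webCrawl ?? new WebCrawl()`. For example (host and target URL are placeholders):

```bash
# Without refresh, repeating a crawl returns the "already exists" error;
# with refresh=1 the URL is fetched again and the stored record is updated
curl "http://127.0.0.1:8000/api/crawl?url=https://example.com&refresh=1"
```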
```diff
@@ -34,7 +35,7 @@ public function crawlWebsite(Request $request)
         // Use GuzzleHttp client to send HTTP requests
         $response = $this->scan($url);
         if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
-            $crawler = new WebCrawl();
+            $crawler = $webCrawl ?? new WebCrawl();
             $crawler->url = $url;
             $crawler->content = $response->getBody()->getContents();
             $linksFromPage = $this->getLinksFromPage($crawler->content);
```
```diff
@@ -46,9 +47,14 @@ public function crawlWebsite(Request $request)
                 ], 500);
             }
             if ($depth > 0 && count($linksFromPage) > 0) {
+                $results = [];
                 foreach ($linksFromPage as $link) {
-                    $this->crawlWebsiteRecursive($link, $depth - 1);
+                    $results[] = $this->crawlWebsiteRecursive($link, $depth - 1);
                 }
+                return response()->json([
+                    'message' => 'Crawling completed successfully',
+                    'data' => $url
+                ]);
             }
         } else {
             return response()->json([
```
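With the early return added above, a crawl with `depth > 0` that found links on the page now answers with a body of this shape (taken from the hunk; the URL is an example):

```json
{
  "message": "Crawling completed successfully",
  "data": "https://example.com"
}
```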
```diff
@@ -66,7 +72,7 @@ protected function crawlWebsiteRecursive($url, $depth)
         // Check if the URL is already in the database
         $webCrawl = WebCrawl::where('url', $url)->first();
         if ($webCrawl) {
-            return;
+            return [];
         }
 
         // Use GuzzleHttp client to send HTTP requests
```
```diff
@@ -79,14 +85,18 @@ protected function crawlWebsiteRecursive($url, $depth)
             try {
                 $crawler->save();
             } catch (\Exception $e) {
-                return;
+                return [];
             }
             if ($depth > 0 && count($linksFromPage) > 0) {
+                $results = [];
                 foreach ($linksFromPage as $link) {
-                    $this->crawlWebsiteRecursive($link, $depth - 1);
+                    $results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
                 }
+                return $results;
             }
         }
+        return [];
     }
 
     protected function getLinksFromPage($crawlerContent)
```

Changes to the route definitions (presumably `routes/api.php`, given the `/api/...` endpoints):

```diff
@@ -4,32 +4,7 @@
 use Illuminate\Support\Facades\Route;
 use App\Http\Controllers\WebCrawlController;
 
-Route::get('/crawl', function (Request $request) {
-    $url = $request->query('url');
-
-    if (!$url || !filter_var($url, FILTER_VALIDATE_URL)) {
-        return response()->json([
-            'error' => 'Missing required parameter `url` or invalid URL',
-        ], 400);
-    }
-    $depth = $request->query('depth', 3);
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->crawlWebsite($request);
-});
-
-Route::get('/', function () {
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->index();
-});
-
-Route::delete('/crawl/{id}', function ($id) {
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->destroy($id);
-});
-
-Route::delete('/crawl', function () {
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->destroyAll();
-});
+Route::get('/crawl', [WebCrawlController::class, 'crawlWebsite']);
+Route::get('/', [WebCrawlController::class, 'index']);
+Route::delete('/crawl/{id}', [WebCrawlController::class, 'destroy']);
+Route::delete('/crawl', [WebCrawlController::class, 'destroyAll']);
```
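One behavioral note grounded in this hunk: the removed closures validated `url` with `filter_var()` and defaulted `depth` to 3, while the new controller-method routes delegate everything to `crawlWebsite()`, whose own default depth is 1. If that validation should now live in the controller, a sketch using Laravel's built-in request validation (hypothetical, not part of this commit) could look like:

```php
// Hypothetical guard at the top of WebCrawlController::crawlWebsite(Request $request),
// replacing the bare $request->query() reads shown in the controller diff above.
$validated = $request->validate([
    'url'   => ['required', 'url'],               // same intent as the old filter_var() check
    'depth' => ['sometimes', 'integer', 'min:0'],
]);

$url   = $validated['url'];
$depth = (int) ($validated['depth'] ?? 1);        // keep the controller's default of 1
```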