README.md added

parent 6a7eb59295
commit 9f23246458

3 changed files with 70 additions and 38 deletions
README.md (+51)

@@ -1,8 +1,55 @@
# Web Crawler API

The Web Crawler API is a simple API that allows you to crawl websites and store the crawled data in a database. It uses GuzzleHttp to send HTTP requests and parses the HTML content to extract links from web pages. The API is built with the Laravel framework.

## Features

- Crawls websites and stores the crawled data in the database.
- Supports setting the depth of the crawling process.
- Prevents duplicate URLs from being crawled.
- Retrieves and saves the HTML content of crawled pages.
- Extracts valid URLs from the crawled pages.

## Prerequisites

- PHP >= 7.4
- Composer
- Laravel framework
- MongoDB
- Docker
- Docker Compose
- GuzzleHttp
- MongoDB PHP driver (the `mongodb.so` extension)
- jenssegers/mongodb package

## Getting Started
1. Clone the repository:

```bash
git clone <repository-url>
```
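2. Install dependencies and prepare the environment. This commit does not spell these steps out, so the commands below are only a typical Laravel setup sketch (it assumes a standard `.env.example` ships with the project):

```bash
# Assumed standard Laravel setup; adjust to the actual project layout.
composer install          # install the PHP dependencies declared in composer.json
cp .env.example .env      # create the local environment file (assumes .env.example exists)
php artisan key:generate  # generate the application key
```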
## Services

### Server

Run the server: `php artisan serve`

### MongoDB

Run MongoDB: `docker-compose up -d`

Run the database migrations: `php artisan migrate`
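The MongoDB step above assumes a `docker-compose.yml` at the project root. That file is not shown in this commit; a minimal sketch of what it might contain for a local MongoDB instance (image tag, port, and volume name are assumptions, not taken from the repository):

```yaml
# Hypothetical docker-compose.yml for the MongoDB service; not taken from the repository.
version: "3.8"
services:
  mongodb:
    image: mongo:6            # any recent MongoDB image should work
    ports:
      - "27017:27017"         # expose the default MongoDB port to the host
    volumes:
      - mongo-data:/data/db   # persist data between container restarts
volumes:
  mongo-data:
```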
## Configuration

Use the `.env` file to set up the database connection.
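The exact variable names depend on how the jenssegers/mongodb connection is defined in this project's `config/database.php`; a commonly used layout looks roughly like this (all values are placeholders, not taken from the repository):

```dotenv
# Hypothetical MongoDB connection settings; adjust names and values to the project's config.
DB_CONNECTION=mongodb
DB_HOST=127.0.0.1
DB_PORT=27017
DB_DATABASE=web_crawler
DB_USERNAME=
DB_PASSWORD=
```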
## API Endpoints

- `GET /api/crawl`: Crawls a website and stores the crawled data in the database. Required query parameter: `url`. Optional query parameter: `depth` (default: 1).
- `GET /api`: Retrieves all crawled data from the database.
- `DELETE /api/crawl/{id}`: Deletes a specific crawled data record from the database.
- `DELETE /api/crawl`: Deletes all crawled data records from the database.
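As an illustration, the endpoints above can be exercised with curl once the server is running on the default `php artisan serve` address (the host/port and the example URL are assumptions):

```bash
# Crawl a site two levels deep (url is required, depth is optional and defaults to 1)
curl "http://127.0.0.1:8000/api/crawl?url=https://example.com&depth=2"

# List everything that has been crawled so far
curl "http://127.0.0.1:8000/api"

# Delete a single crawl record by its id, then delete all records
curl -X DELETE "http://127.0.0.1:8000/api/crawl/<record-id>"
curl -X DELETE "http://127.0.0.1:8000/api/crawl"
```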
```diff
@@ -20,10 +20,11 @@ public function crawlWebsite(Request $request)
     {
         $url = $request->query('url');
         $depth = $request->query('depth', 1);
+        $refresh = $request->query('refresh', false);

         // Check if the URL is already in the database
         $webCrawl = WebCrawl::where('url', $url)->first();
-        if ($webCrawl) {
+        if ($webCrawl && !$refresh) {
             return response()->json([
                 'error' => 'This URL already exists in the database',
             ], 400);
@@ -34,7 +35,7 @@ public function crawlWebsite(Request $request)
         // Use GuzzleHttp client to send HTTP requests
         $response = $this->scan($url);
         if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
-            $crawler = new WebCrawl();
+            $crawler = $webCrawl ?? new WebCrawl();
             $crawler->url = $url;
             $crawler->content = $response->getBody()->getContents();
             $linksFromPage = $this->getLinksFromPage($crawler->content);
@@ -46,9 +47,14 @@ public function crawlWebsite(Request $request)
                 ], 500);
             }
             if ($depth > 0 && count($linksFromPage) > 0) {
+                $results = [];
                 foreach ($linksFromPage as $link) {
-                    $this->crawlWebsiteRecursive($link, $depth - 1);
+                    $results[] = $this->crawlWebsiteRecursive($link, $depth - 1);
                 }
+                return response()->json([
+                    'message' => 'Crawling completed successfully',
+                    'data' => $url
+                ]);
             }
         } else {
             return response()->json([
@@ -66,7 +72,7 @@ protected function crawlWebsiteRecursive($url, $depth)
         // Check if the URL is already in the database
         $webCrawl = WebCrawl::where('url', $url)->first();
         if ($webCrawl) {
-            return;
+            return [];
         }

         // Use GuzzleHttp client to send HTTP requests
@@ -79,14 +85,18 @@ protected function crawlWebsiteRecursive($url, $depth)
             try {
                 $crawler->save();
             } catch (\Exception $e) {
-                return;
+                return [];
             }
             if ($depth > 0 && count($linksFromPage) > 0) {
+                $results = [];
                 foreach ($linksFromPage as $link) {
-                    $this->crawlWebsiteRecursive($link, $depth - 1);
+                    $results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
                 }
+                return $results;
             }
         }

+        return [];
     }

     protected function getLinksFromPage($crawlerContent)
```
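The hunks above add an optional `refresh` query parameter that re-crawls a URL even if it already exists in the database (the existing record is reused and updated). A hypothetical request exercising it, assuming the default local host and an example URL, might look like:

```bash
# Without refresh, an already-crawled URL returns a 400 "already exists" error;
# with refresh=1 the existing record is re-fetched and updated.
curl "http://127.0.0.1:8000/api/crawl?url=https://example.com&refresh=1"
```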
```diff
@@ -4,32 +4,7 @@
 use Illuminate\Support\Facades\Route;
 use App\Http\Controllers\WebCrawlController;

-Route::get('/crawl', function (Request $request) {
-    $url = $request->query('url');
-
-    if (!$url || !filter_var($url, FILTER_VALIDATE_URL)) {
-        return response()->json([
-            'error' => 'Missing required parameter `url` or invalid URL',
-        ], 400);
-    }
-
-    $depth = $request->query('depth', 3);
-
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->crawlWebsite($request);
-});
-
-Route::get('/', function () {
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->index();
-});
-
-Route::delete('/crawl/{id}', function ($id) {
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->destroy($id);
-});
-
-Route::delete('/crawl', function () {
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->destroyAll();
-});
+Route::get('/crawl', [WebCrawlController::class, 'crawlWebsite']);
+Route::get('/', [WebCrawlController::class, 'index']);
+Route::delete('/crawl/{id}', [WebCrawlController::class, 'destroy']);
+Route::delete('/crawl', [WebCrawlController::class, 'destroyAll']);
```