diff --git a/README.md b/README.md
index cbdb580..1d23bab 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,55 @@
-Run the server - php artisan serve
+# Web Crawler API
 
-run mongo - run docker-compose up -d
+The Web Crawler API is a simple API that lets you crawl websites and store the crawled data in a database. It uses GuzzleHttp to send HTTP requests and parses the HTML content to extract links from web pages. The API is built with the Laravel framework.
 
-migrate - php artisan migrate
+## Features
+
+- Crawls websites and stores the crawled data in the database.
+- Supports setting the depth of the crawling process.
+- Prevents duplicate URLs from being crawled.
+- Retrieves and saves the HTML content of crawled pages.
+- Extracts valid URLs from the crawled pages.
+
+## Prerequisites
+
+- PHP >= 7.4
+- Composer
+- Laravel framework
+- MongoDB
+- Docker
+- Docker Compose
+- GuzzleHttp
+- MongoDB PHP driver (extension: mongodb.so)
+- jenssegers/mongodb package
+
+## Getting Started
+
+1. Clone the repository:
+
+   ```bash
+   git clone
+   ```
+
+## Services
+
+- Run the server: `php artisan serve`
+- Run MongoDB: `docker-compose up -d`
+- Run the migrations: `php artisan migrate`
+
+## Configuration
+
+Use the `.env` file to set up the database connection.
+
+## API Endpoints
+
+- `GET /api/crawl`: Crawls a website and stores the crawled data in the database. Required query parameter: `url`. Optional query parameters: `depth` (default: 1) and `refresh` (re-crawls a URL that is already stored instead of rejecting it as a duplicate).
+- `GET /api`: Retrieves all crawled data from the database.
+- `DELETE /api/crawl/{id}`: Deletes a specific crawled data record from the database.
+- `DELETE /api/crawl`: Deletes all crawled data records from the database.
diff --git a/app/Http/Controllers/WebCrawlController.php b/app/Http/Controllers/WebCrawlController.php
index 602cd5e..f62035b 100644
--- a/app/Http/Controllers/WebCrawlController.php
+++ b/app/Http/Controllers/WebCrawlController.php
@@ -20,10 +20,11 @@ public function crawlWebsite(Request $request)
     {
         $url = $request->query('url');
         $depth = $request->query('depth', 1);
+        $refresh = $request->query('refresh', false);
 
         // Check if the URL is already in the database
         $webCrawl = WebCrawl::where('url', $url)->first();
-        if ($webCrawl) {
+        if ($webCrawl && !$refresh) {
             return response()->json([
                 'error' => 'This URL already exists in the database',
             ], 400);
@@ -34,7 +35,7 @@ public function crawlWebsite(Request $request)
         // Use GuzzleHttp client to send HTTP requests
         $response = $this->scan($url);
         if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
-            $crawler = new WebCrawl();
+            $crawler = $webCrawl ?? new WebCrawl();
             $crawler->url = $url;
             $crawler->content = $response->getBody()->getContents();
             $linksFromPage = $this->getLinksFromPage($crawler->content);
@@ -46,9 +47,14 @@
                 ], 500);
             }
             if ($depth > 0 && count($linksFromPage) > 0) {
+                $results = [];
                 foreach ($linksFromPage as $link) {
-                    $this->crawlWebsiteRecursive($link, $depth - 1);
+                    $results[] = $this->crawlWebsiteRecursive($link, $depth - 1);
                 }
+                return response()->json([
+                    'message' => 'Crawling completed successfully',
+                    'data' => $url
+                ]);
             }
         } else {
             return response()->json([
@@ -66,7 +72,7 @@ protected function crawlWebsiteRecursive($url, $depth)
         // Check if the URL is already in the database
         $webCrawl = WebCrawl::where('url', $url)->first();
         if ($webCrawl) {
-            return;
+            return [];
         }
 
         // Use GuzzleHttp client to send HTTP requests
@@ -79,14 +85,18 @@ protected function crawlWebsiteRecursive($url, $depth)
             try {
                 $crawler->save();
             } catch (\Exception $e) {
-                return;
+                return [];
             }
             if ($depth > 0 && count($linksFromPage) > 0) {
+                $results = [];
                 foreach ($linksFromPage as $link) {
-                    $this->crawlWebsiteRecursive($link, $depth - 1);
+                    $results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
                 }
+                return $results;
             }
         }
+
+        return [];
     }
 
     protected function getLinksFromPage($crawlerContent)
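The `refresh` flag introduced above loosens the duplicate guard: a URL that already exists in the database is rejected with a 400 only when `refresh` is not set, and the existing record is reused (`$webCrawl ?? new WebCrawl()`) when it is. Below is a minimal sketch of a Laravel feature test for this flow; the class name and target URL are illustrative, and a real test would stub the GuzzleHttp client rather than crawl the live network:

```php
<?php

namespace Tests\Feature;

use Tests\TestCase;

// Hypothetical test class; assumes the routes from routes/api.php
// are served under Laravel's default /api prefix.
class WebCrawlApiTest extends TestCase
{
    public function test_duplicates_are_rejected_unless_refresh_is_set(): void
    {
        // First crawl stores the page and follows links one level deep.
        $this->getJson('/api/crawl?url=https://example.com&depth=1')
            ->assertSuccessful();

        // Repeating the crawl without refresh hits the duplicate guard.
        $this->getJson('/api/crawl?url=https://example.com')
            ->assertStatus(400);

        // With refresh set, the existing record is re-crawled in place.
        $this->getJson('/api/crawl?url=https://example.com&refresh=1')
            ->assertSuccessful();
    }
}
```

Because `refresh` is read with `$request->query('refresh', false)`, any truthy query value such as `refresh=1` enables it, while `refresh=0` stays falsy in PHP.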
diff --git a/routes/api.php b/routes/api.php
index 55bfb08..247e1ae 100644
--- a/routes/api.php
+++ b/routes/api.php
@@ -4,32 +4,7 @@
 use Illuminate\Support\Facades\Route;
 use App\Http\Controllers\WebCrawlController;
 
-Route::get('/crawl', function (Request $request) {
-    $url = $request->query('url');
-
-    if (!$url || !filter_var($url, FILTER_VALIDATE_URL)) {
-        return response()->json([
-            'error' => 'Missing required parameter `url` or invalid URL',
-        ], 400);
-    }
-
-    $depth = $request->query('depth', 3);
-
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->crawlWebsite($request);
-});
-
-Route::get('/', function () {
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->index();
-});
-
-Route::delete('/crawl/{id}', function ($id) {
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->destroy($id);
-});
-
-Route::delete('/crawl', function () {
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->destroyAll();
-});
+Route::get('/crawl', [WebCrawlController::class, 'crawlWebsite']);
+Route::get('/', [WebCrawlController::class, 'index']);
+Route::delete('/crawl/{id}', [WebCrawlController::class, 'destroy']);
+Route::delete('/crawl', [WebCrawlController::class, 'destroyAll']);
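One behavioral note on the route cleanup: the deleted `/crawl` closure validated `url` with `filter_var()` and returned a 400 for a missing or malformed value (and defaulted `depth` to 3, where the controller uses 1), while the controller-based routes now delegate straight to `crawlWebsite`, which performs no such check. A minimal sketch of restoring that guard inside the controller, assuming Laravel's built-in request validation (which responds with a 422 rather than the closure's 400 on failure):

```php
<?php

namespace App\Http\Controllers;

use Illuminate\Http\Request;

class WebCrawlController extends Controller
{
    public function crawlWebsite(Request $request)
    {
        // Reject missing or malformed input up front, mirroring the
        // checks the deleted route closure used to perform.
        $validated = $request->validate([
            'url'   => ['required', 'url'],
            'depth' => ['sometimes', 'integer', 'min:0'],
        ]);

        $url = $validated['url'];
        $depth = (int) $request->query('depth', 1);

        // ... the crawl logic from the diff above continues unchanged.
    }
}
```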