README.md added

Kfir Dayan 2023-05-31 13:17:19 +03:00
parent 6a7eb59295
commit 9f23246458
3 changed files with 70 additions and 38 deletions

README.md (the old three-line quick-start commands were replaced with the documentation below):

# Web Crawler API

The Web Crawler API is a simple API that allows you to crawl websites and store the crawled data in a database. It uses GuzzleHttp to send HTTP requests and parses the HTML content to extract links from web pages. The API is built with the Laravel framework.

## Features
- Crawls websites and stores the crawled data in the database.
- Supports setting the depth of the crawling process.
- Prevents duplicate URLs from being crawled.
- Retrieves and saves the HTML content of crawled pages.
- Extracts valid URLs from the crawled pages.
## Prerequisites
- PHP >= 7.4
- Composer
- Laravel framework
- MongoDB
- Docker
- Docker Compose
- GuzzleHttp
- MongoDB PHP driver (the `mongodb.so` extension)
- The `jenssegers/mongodb` package
## Getting Started
1. Clone the repository:
```bash
git clone <repository-url>
```
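The commit's Getting Started section stops after the clone step. The remaining setup is not spelled out in the diff; a plausible sequence, assuming a standard Laravel project with the prerequisites listed above (all commands below are assumptions, not part of the commit):

```bash
# Install PHP dependencies (assumes composer.json already requires
# guzzlehttp/guzzle and jenssegers/mongodb, as listed under Prerequisites)
composer install

# Install the MongoDB PHP extension if it is not already loaded
pecl install mongodb

# Create the environment file and application key (standard Laravel steps)
cp .env.example .env
php artisan key:generate

# Start MongoDB, run the migrations, and serve the API
# (these three commands come from the Services section below)
docker-compose up -d
php artisan migrate
php artisan serve
```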
## Services
### Server
Run the server: `php artisan serve`
### MongoDB
Start MongoDB: `docker-compose up -d`
Run the migrations: `php artisan migrate`
## Configuration
Use the `.env` file to set up the database connection.
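A minimal `.env` sketch for the MongoDB connection, assuming the usual `jenssegers/mongodb` connection name and the Docker Compose defaults; host, port, and database name are assumptions and should match `docker-compose.yml` and `config/database.php`:

```ini
DB_CONNECTION=mongodb
DB_HOST=127.0.0.1
DB_PORT=27017
DB_DATABASE=web_crawler
DB_USERNAME=
DB_PASSWORD=
```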
## API Endpoints
- `GET /api/crawl`: Crawls a website and stores the crawled data in the database. Required query parameter: `url`. Optional query parameter: `depth` (default: 1).
- `GET /api`: Retrieves all crawled data from the database.
- `DELETE /api/crawl/{id}`: Deletes a specific crawled data record from the database.
- `DELETE /api/crawl`: Deletes all crawled data records from the database.
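Example requests against a local `php artisan serve` instance; the host, port, target URL, and record id below are placeholders:

```bash
# Crawl a site two levels deep
curl "http://127.0.0.1:8000/api/crawl?url=https://example.com&depth=2"

# List everything that has been crawled
curl "http://127.0.0.1:8000/api"

# Delete a single crawled record by id, then delete them all
curl -X DELETE "http://127.0.0.1:8000/api/crawl/<record-id>"
curl -X DELETE "http://127.0.0.1:8000/api/crawl"
```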

Changes to `WebCrawlController` (presumably `app/Http/Controllers/WebCrawlController.php`), shown as a unified diff:

```diff
@@ -20,10 +20,11 @@ public function crawlWebsite(Request $request)
     {
         $url = $request->query('url');
         $depth = $request->query('depth', 1);
+        $refresh = $request->query('refresh', false);
 
         // Check if the URL is already in the database
         $webCrawl = WebCrawl::where('url', $url)->first();
-        if ($webCrawl) {
+        if ($webCrawl && !$refresh) {
             return response()->json([
                 'error' => 'This URL already exists in the database',
             ], 400);
```
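The new `refresh` flag relaxes the duplicate check above: when it is set, an already-stored URL is crawled again instead of being rejected with the 400 error, and the next hunk reuses the existing document via `$webCrawl ?? new WebCrawl()`. For example (host and target URL are placeholders):

```bash
# Without refresh, repeating a crawl returns the "already exists" error;
# with refresh=1 the URL is fetched again and the stored record is updated
curl "http://127.0.0.1:8000/api/crawl?url=https://example.com&refresh=1"
```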
```diff
@@ -34,7 +35,7 @@ public function crawlWebsite(Request $request)
         // Use GuzzleHttp client to send HTTP requests
         $response = $this->scan($url);
         if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
-            $crawler = new WebCrawl();
+            $crawler = $webCrawl ?? new WebCrawl();
             $crawler->url = $url;
             $crawler->content = $response->getBody()->getContents();
             $linksFromPage = $this->getLinksFromPage($crawler->content);
```
```diff
@@ -46,9 +47,14 @@ public function crawlWebsite(Request $request)
                 ], 500);
             }
             if ($depth > 0 && count($linksFromPage) > 0) {
+                $results = [];
                 foreach ($linksFromPage as $link) {
-                    $this->crawlWebsiteRecursive($link, $depth - 1);
+                    $results[] = $this->crawlWebsiteRecursive($link, $depth - 1);
                 }
+                return response()->json([
+                    'message' => 'Crawling completed successfully',
+                    'data' => $url
+                ]);
             }
         } else {
             return response()->json([
```
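With the early return added above, a crawl with `depth > 0` that found links on the page now answers with a body of this shape (taken from the hunk; the URL is an example):

```json
{
  "message": "Crawling completed successfully",
  "data": "https://example.com"
}
```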
```diff
@@ -66,7 +72,7 @@ protected function crawlWebsiteRecursive($url, $depth)
         // Check if the URL is already in the database
         $webCrawl = WebCrawl::where('url', $url)->first();
         if ($webCrawl) {
-            return;
+            return [];
         }
 
         // Use GuzzleHttp client to send HTTP requests
```
```diff
@@ -79,14 +85,18 @@ protected function crawlWebsiteRecursive($url, $depth)
             try {
                 $crawler->save();
             } catch (\Exception $e) {
-                return;
+                return [];
             }
             if ($depth > 0 && count($linksFromPage) > 0) {
+                $results = [];
                 foreach ($linksFromPage as $link) {
-                    $this->crawlWebsiteRecursive($link, $depth - 1);
+                    $results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
                 }
+                return $results;
             }
         }
+        return [];
     }
 
     protected function getLinksFromPage($crawlerContent)
```

Changes to the route definitions (presumably `routes/api.php`, given the `/api/...` endpoints):

```diff
@@ -4,32 +4,7 @@
 use Illuminate\Support\Facades\Route;
 use App\Http\Controllers\WebCrawlController;
 
-Route::get('/crawl', function (Request $request) {
-    $url = $request->query('url');
-
-    if (!$url || !filter_var($url, FILTER_VALIDATE_URL)) {
-        return response()->json([
-            'error' => 'Missing required parameter `url` or invalid URL',
-        ], 400);
-    }
-    $depth = $request->query('depth', 3);
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->crawlWebsite($request);
-});
-
-Route::get('/', function () {
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->index();
-});
-
-Route::delete('/crawl/{id}', function ($id) {
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->destroy($id);
-});
-
-Route::delete('/crawl', function () {
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->destroyAll();
-});
+Route::get('/crawl', [WebCrawlController::class, 'crawlWebsite']);
+Route::get('/', [WebCrawlController::class, 'index']);
+Route::delete('/crawl/{id}', [WebCrawlController::class, 'destroy']);
+Route::delete('/crawl', [WebCrawlController::class, 'destroyAll']);
```
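One behavioral note grounded in this hunk: the removed closures validated `url` with `filter_var()` and defaulted `depth` to 3, while the new controller-method routes delegate everything to `crawlWebsite()`, whose own default depth is 1. If that validation should now live in the controller, a sketch using Laravel's built-in request validation (hypothetical, not part of this commit) could look like:

```php
// Hypothetical guard at the top of WebCrawlController::crawlWebsite(Request $request),
// replacing the bare $request->query() reads shown in the controller diff above.
$validated = $request->validate([
    'url'   => ['required', 'url'],               // same intent as the old filter_var() check
    'depth' => ['sometimes', 'integer', 'min:0'],
]);

$url   = $validated['url'];
$depth = (int) ($validated['depth'] ?? 1);        // keep the controller's default of 1
```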