From 9f232464586f3f54d425f1a6681c1c8acde4230c Mon Sep 17 00:00:00 2001
From: Kfir Dayan
Date: Wed, 31 May 2023 13:17:19 +0300
Subject: [PATCH] Add README.md

---
 README.md                                   | 53 +++++++++++++++++++--
 app/Http/Controllers/WebCrawlController.php | 22 ++++++---
 routes/api.php                              | 33 ++-----------
 3 files changed, 70 insertions(+), 38 deletions(-)

diff --git a/README.md b/README.md
index cbdb580..1d23bab 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,55 @@
-Run the server - php artisan serve
+# Web Crawler API

-run mongo - run docker-compose up -d
+The Web Crawler API is a simple API that allows you to crawl websites and store the crawled data in a database. It uses GuzzleHttp to send HTTP requests and parses the HTML content to extract links from web pages. The API is built with the Laravel framework.

-migrate - php artisan migrate
+## Features
+
+- Crawls websites and stores the crawled data in the database.
+- Supports setting the depth of the crawling process.
+- Prevents duplicate URLs from being crawled.
+- Retrieves and saves the HTML content of crawled pages.
+- Extracts valid URLs from the crawled pages.
+
+## Prerequisites
+
+- PHP >= 7.4
+- Composer
+- Laravel framework
+- MongoDB
+- Docker
+- Docker Compose
+- GuzzleHttp
+- MongoDB PHP driver (extension: mongodb.so)
+- jenssegers/mongodb package
+
+## Getting Started
+
+1. Clone the repository:
+
+   ```bash
+   git clone
+   ```
+
+## Services
+
+### Server
+
+- Run the server: `php artisan serve`
+
+### MongoDB
+
+- Run MongoDB: `docker-compose up -d`
+- Run the migrations: `php artisan migrate`
+
+## Configuration
+
+Use the .env file to set up the database connection.
+
+## API Endpoints
+
+- `GET /api/crawl`: Crawls a website and stores the crawled data in the database. Required query parameter: `url`. Optional query parameters: `depth` (default: 1) and `refresh` (re-crawl a URL that already exists in the database).
+- `GET /api`: Retrieves all crawled data from the database.
+- `DELETE /api/crawl/{id}`: Deletes a specific crawled data record from the database.
+- `DELETE /api/crawl`: Deletes all crawled data records from the database.
+
diff --git a/app/Http/Controllers/WebCrawlController.php b/app/Http/Controllers/WebCrawlController.php
index 602cd5e..f62035b 100644
--- a/app/Http/Controllers/WebCrawlController.php
+++ b/app/Http/Controllers/WebCrawlController.php
@@ -20,10 +20,11 @@ public function crawlWebsite(Request $request)
     {
         $url = $request->query('url');
         $depth = $request->query('depth', 1);
+        $refresh = $request->query('refresh', false);

         // Check if the URL is already in the database
         $webCrawl = WebCrawl::where('url', $url)->first();
-        if ($webCrawl) {
+        if ($webCrawl && !$refresh) {
             return response()->json([
                 'error' => 'This URL already exists in the database',
             ], 400);
@@ -34,7 +35,7 @@ public function crawlWebsite(Request $request)
         // Use GuzzleHttp client to send HTTP requests
         $response = $this->scan($url);
         if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
-            $crawler = new WebCrawl();
+            $crawler = $webCrawl ?? new WebCrawl();
             $crawler->url = $url;
             $crawler->content = $response->getBody()->getContents();
             $linksFromPage = $this->getLinksFromPage($crawler->content);
@@ -46,9 +47,14 @@
                 ], 500);
             }
             if ($depth > 0 && count($linksFromPage) > 0) {
+                $results = [];
                 foreach ($linksFromPage as $link) {
-                    $this->crawlWebsiteRecursive($link, $depth - 1);
+                    $results[] = $this->crawlWebsiteRecursive($link, $depth - 1);
                 }
+                return response()->json([
+                    'message' => 'Crawling completed successfully',
+                    'data' => $url
+                ]);
             }
         } else {
             return response()->json([
@@ -66,7 +72,7 @@ protected function crawlWebsiteRecursive($url, $depth)
         // Check if the URL is already in the database
         $webCrawl = WebCrawl::where('url', $url)->first();
         if ($webCrawl) {
-            return;
+            return [];
         }

         // Use GuzzleHttp client to send HTTP requests
@@ -79,14 +85,18 @@ protected function crawlWebsiteRecursive($url, $depth)
             try {
                 $crawler->save();
             } catch (\Exception $e) {
-                return;
+                return [];
             }
             if ($depth > 0 && count($linksFromPage) > 0) {
+                $results = [];
                 foreach ($linksFromPage as $link) {
-                    $this->crawlWebsiteRecursive($link, $depth - 1);
+                    $results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
                 }
+                return $results;
             }
         }
+
+        return [];
     }

     protected function getLinksFromPage($crawlerContent)
diff --git a/routes/api.php b/routes/api.php
index 55bfb08..247e1ae 100644
--- a/routes/api.php
+++ b/routes/api.php
@@ -4,32 +4,7 @@
 use Illuminate\Support\Facades\Route;
 use App\Http\Controllers\WebCrawlController;

-Route::get('/crawl', function (Request $request) {
-    $url = $request->query('url');
-
-    if (!$url || !filter_var($url, FILTER_VALIDATE_URL)) {
-        return response()->json([
-            'error' => 'Missing required parameter `url` or invalid URL',
-        ], 400);
-    }
-
-    $depth = $request->query('depth', 3);
-
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->crawlWebsite($request);
-});
-
-Route::get('/', function () {
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->index();
-});
-
-Route::delete('/crawl/{id}', function ($id) {
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->destroy($id);
-});
-
-Route::delete('/crawl', function () {
-    $crawlerController = new WebCrawlController();
-    return $crawlerController->destroyAll();
-});
+Route::get('/crawl', [WebCrawlController::class, 'crawlWebsite']);
+Route::get('/', [WebCrawlController::class, 'index']);
+Route::delete('/crawl/{id}', [WebCrawlController::class, 'destroy']);
+Route::delete('/crawl', [WebCrawlController::class, 'destroyAll']);
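
The README's Prerequisites section names the MongoDB PHP driver and the jenssegers/mongodb package but gives no install commands. A minimal setup sketch, assuming PECL and Composer are available on the host; the exact steps vary by environment and are not prescribed by this patch:

```bash
# Build and install the MongoDB PHP driver (provides mongodb.so);
# enable "extension=mongodb.so" in php.ini afterwards
pecl install mongodb

# Install the PHP dependencies declared in composer.json (GuzzleHttp, jenssegers/mongodb, ...)
composer install
```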
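With MongoDB and the artisan server running, the endpoints listed in the README (including the refresh behaviour added by this patch) can be exercised with curl. A minimal sketch, assuming the app is served locally at http://localhost:8000 and using https://example.com and <id> as placeholders:

```bash
# Crawl a site two levels deep and store the pages
curl "http://localhost:8000/api/crawl?url=https://example.com&depth=2"

# Re-crawl a URL that is already stored (the refresh parameter is introduced by this patch)
curl "http://localhost:8000/api/crawl?url=https://example.com&refresh=1"

# List all stored crawl records
curl "http://localhost:8000/api"

# Delete one record by id, or delete everything
curl -X DELETE "http://localhost:8000/api/crawl/<id>"
curl -X DELETE "http://localhost:8000/api/crawl"
```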