Re-order the code; add the service provider, the service itself, and the WebCrawlRequest form request

This commit is contained in:
Kfir Dayan 2023-06-01 08:33:46 +03:00
parent 9ac63c8db0
commit 231cd17902
4 changed files with 196 additions and 157 deletions

View file

@@ -2,158 +2,39 @@
namespace App\Http\Controllers;
use App\Models\WebCrawl;
use GuzzleHttp\Client;
use Illuminate\Http\Request;
use Illuminate\Support\Facades\Log;
use App\Http\Requests\WebCrawlRequest;
use App\Services\WebCrawlerService;
class WebCrawlController extends Controller
{
protected $webClient;
protected $webCrawlerService;
public function __construct(WebCrawlerService $webCrawlerService)
{
$this->webCrawlerService = $webCrawlerService;
}
public function index()
{
$allCrawls = WebCrawl::all();
return response()->json($allCrawls);
return $this->webCrawlerService->getAllCrawls();
}
public function crawlWebsite(Request $request)
public function crawlWebsite(WebCrawlRequest $request)
{
$url = $request->query('url');
$depth = $request->query('depth', 0);
$refresh = $request->query('refresh', false);
// Check if the URL is already in the database
$webCrawl = WebCrawl::where('url', $url)->first();
if ($webCrawl && !$refresh) {
Log::error("This URL already exists in the database $url");
return response()->json([
'error' => 'This URL already exists in the database',
], 400);
}
$this->webClient = new Client();
// Use GuzzleHttp client to send HTTP requests
$response = $this->scan($url);
if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
$crawler = $webCrawl ?? new WebCrawl();
$crawler->url = $url;
$crawler->content = $response->getBody()->getContents();
$linksFromPage = $this->getLinksFromPage($crawler->content);
try {
$crawler->save();
} catch (\Exception $e) {
Log::error($e->getMessage());
return response()->json([
'error' => 'Failed to save the URL to the database',
], 500);
}
if ($depth > 0 && count($linksFromPage) > 0) {
$results = [];
foreach ($linksFromPage as $link) {
$results[] = $this->crawlWebsiteRecursive($link, $depth - 1);
}
return response()->json([
'message' => 'Crawling completed successfully',
'data' => $url
]);
}
} else {
Log::error("Failed to retrieve the URL $url");
return response()->json([
'error' => 'Failed to retrieve the URL',
], 500);
}
Log::info("Crawling completed successfully For URL $url");
return response()->json([
'message' => 'Crawling completed successfully',
]);
}
protected function crawlWebsiteRecursive($url, $depth)
{
// Check if the URL is already in the database
$webCrawl = WebCrawl::where('url', $url)->first();
if ($webCrawl) {
return [];
}
// Use GuzzleHttp client to send HTTP requests
$response = $this->scan($url);
if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
$crawler = new WebCrawl();
$crawler->url = $url;
$crawler->content = $response->getBody()->getContents();
$linksFromPage = $this->getLinksFromPage($crawler->content);
try {
$crawler->save();
Log::info("URL saved to the database $url");
} catch (\Exception $e) {
Log::error("Can't save the URL to the database $url");
return [];
}
if ($depth > 0 && count($linksFromPage) > 0) {
$results = [];
foreach ($linksFromPage as $link) {
$results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
}
return $results;
}
}
return [];
}
protected function getLinksFromPage($crawlerContent)
{
$dom = new \DOMDocument();
@$dom->loadHTML($crawlerContent);
$links = $dom->getElementsByTagName('a');
$linksFromPage = [];
foreach ($links as $link) {
if ($link->getAttribute('href') && filter_var($link->getAttribute('href'), FILTER_VALIDATE_URL)) {
$linksFromPage[] = $link->getAttribute('href');
}
}
return $linksFromPage;
}
protected function scan($url)
{
try {
$result = $this->webClient->request('GET', $url);
} catch (\Exception $e) {
return response()->json([
'error' => 'Failed to retrieve the URL',
], 500);
}
return $result;
return $this->webCrawlerService->crawlWebsite($url, $depth, $refresh);
}
public function destroy($id)
{
$webCrawl = WebCrawl::find($id);
if ($webCrawl) {
$webCrawl->delete();
Log::info("Web crawl deleted successfully For ID $id");
return response()->json([
'message' => 'Web crawl deleted successfully',
]);
}
Log::error("Web crawl not found For ID $id");
return response()->json([
'error' => 'Web crawl not found',
], 404);
return $this->webCrawlerService->deleteCrawl($id);
}
public function destroyAll()
{
WebCrawl::truncate();
Log::info("All web crawls deleted successfully");
return response()->json([
'message' => 'All web crawls deleted successfully',
]);
return $this->webCrawlerService->deleteAllCrawls();
}
}

View file

@@ -0,0 +1,23 @@
<?php
namespace App\Http\Requests;
use Illuminate\Foundation\Http\FormRequest;
/**
 * Validates query parameters for the web-crawl endpoint.
 *
 * Expected input:
 *  - url:     required; must be a syntactically valid URL
 *  - depth:   optional crawl depth; non-negative integer
 *  - refresh: optional integer flag; truthy forces re-crawling a stored URL
 */
class WebCrawlRequest extends FormRequest
{
    /**
     * Determine whether the caller is authorized to run a crawl.
     *
     * @return bool
     */
    public function authorize(): bool
    {
        // No authorization restrictions yet; add gate/policy checks here if needed.
        return true;
    }

    /**
     * Validation rules applied to the request input.
     *
     * @return array<string, string>
     */
    public function rules(): array
    {
        return [
            'url' => 'required|url',
            'depth' => 'integer|min:0',
            // NOTE(review): 'boolean' may express the intent better than
            // 'integer' here, since callers treat refresh as a flag — confirm.
            'refresh' => 'integer',
        ];
    }
}

View file

@@ -2,34 +2,15 @@
namespace App\Providers;
use App\Services\WebCrawlerService;
use Illuminate\Support\ServiceProvider;
class CrawlerServiceProvider extends ServiceProvider
class WebCrawlServiceProvider extends ServiceProvider
{
/**
* Register services.
*
* @return void
*/
public function register()
{
//
$this->app->singleton(WebCrawlerService::class, function () {
return new WebCrawlerService();
});
}
public function crawlWebsite($url, $depth) {
$visitedUrls = [];
echo 'HERE!';die;
}
/**
* Bootstrap services.
*
* @return void
*/
public function boot()
{
//
}
}

View file

@@ -0,0 +1,154 @@
<?php
namespace App\Services;
use App\Models\WebCrawl;
use GuzzleHttp\Client;
use Illuminate\Support\Facades\Log;
/**
 * Crawls web pages, persists them as WebCrawl records, and recursively
 * follows links up to a caller-supplied depth.
 *
 * All public methods return Laravel JSON responses so the controller can
 * pass them straight through.
 */
class WebCrawlerService
{
    /**
     * HTTP client used to fetch pages.
     *
     * @var Client
     */
    protected $webClient;

    public function __construct()
    {
        $this->webClient = new Client();
    }

    /**
     * Return every stored crawl as a JSON response.
     */
    public function getAllCrawls()
    {
        return response()->json(WebCrawl::all());
    }

    /**
     * Crawl $url, persist the page content, and follow its links.
     *
     * @param string $url     Starting URL.
     * @param int    $depth   Levels of links to follow (0 = only $url itself).
     * @param mixed  $refresh Truthy to re-crawl a URL that is already stored.
     */
    public function crawlWebsite($url, $depth, $refresh)
    {
        // Refuse duplicates unless the caller explicitly asked for a refresh.
        $existing = WebCrawl::where('url', $url)->first();
        if ($existing && !$refresh) {
            Log::error("This URL already exists in the database $url");
            return response()->json([
                'error' => 'This URL already exists in the database',
            ], 400);
        }

        $response = $this->scan($url);
        // scan() returns null on transport failure; also reject non-2xx codes.
        if ($response === null
            || $response->getStatusCode() < 200
            || $response->getStatusCode() >= 300
        ) {
            Log::error("Failed to retrieve the URL $url");
            return response()->json([
                'error' => 'Failed to retrieve the URL',
            ], 500);
        }

        // Reuse the existing row on refresh so we update rather than duplicate.
        $crawl = $existing ?? new WebCrawl();
        $crawl->url = $url;
        $crawl->content = $response->getBody()->getContents();
        $links = $this->getLinksFromPage($crawl->content);

        try {
            $crawl->save();
        } catch (\Exception $e) {
            Log::error($e->getMessage());
            return response()->json([
                'error' => 'Failed to save the URL to the database',
            ], 500);
        }

        if ($depth > 0 && count($links) > 0) {
            foreach ($links as $link) {
                // Recursion persists pages as a side effect; the per-link
                // results are intentionally not surfaced to the caller.
                $this->crawlWebsiteRecursive($link, $depth - 1);
            }
            return response()->json([
                'message' => 'Crawling completed successfully',
                'data' => $url,
            ]);
        }

        Log::info("Crawling completed successfully For URL $url");
        return response()->json([
            'message' => 'Crawling completed successfully',
        ]);
    }

    /**
     * Delete a single crawl record by id.
     *
     * @param mixed $id Primary key of the WebCrawl row.
     */
    public function deleteCrawl($id)
    {
        $webCrawl = WebCrawl::find($id);
        if ($webCrawl) {
            $webCrawl->delete();
            Log::info("Web crawl deleted successfully For ID $id");
            return response()->json([
                'message' => 'Web crawl deleted successfully',
            ]);
        }

        Log::error("Web crawl not found For ID $id");
        return response()->json([
            'error' => 'Web crawl not found',
        ], 404);
    }

    /**
     * Delete every stored crawl record.
     */
    public function deleteAllCrawls()
    {
        WebCrawl::truncate();
        Log::info("All web crawls deleted successfully");
        return response()->json([
            'message' => 'All web crawls deleted successfully',
        ]);
    }

    /**
     * Depth-first recursive crawl helper.
     *
     * @param string $url   URL to fetch and persist.
     * @param int    $depth Remaining levels of links to follow.
     * @return array Flat list of results from deeper levels (empty on skip/failure).
     */
    protected function crawlWebsiteRecursive($url, $depth)
    {
        // Skip URLs that were already crawled; this also terminates cycles.
        if (WebCrawl::where('url', $url)->first()) {
            return [];
        }

        $response = $this->scan($url);
        if ($response === null
            || $response->getStatusCode() < 200
            || $response->getStatusCode() >= 300
        ) {
            return [];
        }

        $crawl = new WebCrawl();
        $crawl->url = $url;
        $crawl->content = $response->getBody()->getContents();
        $links = $this->getLinksFromPage($crawl->content);

        try {
            $crawl->save();
            Log::info("URL saved to the database $url");
        } catch (\Exception $e) {
            Log::error("Can't save the URL to the database $url");
            return [];
        }

        $results = [];
        if ($depth > 0) {
            foreach ($links as $link) {
                $results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
            }
        }
        return $results;
    }

    /**
     * Extract absolute links from an HTML document.
     *
     * @param string $crawlerContent Raw HTML.
     * @return array<int, string> href values that pass FILTER_VALIDATE_URL.
     */
    protected function getLinksFromPage($crawlerContent)
    {
        $dom = new \DOMDocument();
        // Suppress libxml warnings: real-world HTML is frequently malformed.
        @$dom->loadHTML($crawlerContent);

        $found = [];
        foreach ($dom->getElementsByTagName('a') as $anchor) {
            $href = $anchor->getAttribute('href');
            if ($href && filter_var($href, FILTER_VALIDATE_URL)) {
                $found[] = $href;
            }
        }
        return $found;
    }

    /**
     * Fetch $url via GET.
     *
     * Previously this returned a JsonResponse on failure, which callers then
     * probed with getStatusCode() — a type confusion that only worked by
     * accident. Returning null makes the failure explicit.
     *
     * @param string $url
     * @return \Psr\Http\Message\ResponseInterface|null The HTTP response, or
     *         null when the request could not be completed.
     */
    protected function scan($url)
    {
        try {
            return $this->webClient->request('GET', $url);
        } catch (\Exception $e) {
            Log::error("Request failed for URL $url: " . $e->getMessage());
            return null;
        }
    }
}