From 231cd179029a7004423fc7933e42e35999b59132 Mon Sep 17 00:00:00 2001
From: Kfir Dayan
Date: Thu, 1 Jun 2023 08:33:46 +0300
Subject: [PATCH] Reorder the code: add a service provider, the service itself, and a WebCrawlRequest form request

---
 app/Http/Controllers/WebCrawlController.php | 145 ++----------------
 app/Http/Requests/WebCrawlRequest.php       |  23 +++
 app/Providers/CrawlerServiceProvider.php    |  31 +---
 app/Services/WebCrawlerService.php          | 154 ++++++++++++++++++++
 4 files changed, 196 insertions(+), 157 deletions(-)
 create mode 100644 app/Http/Requests/WebCrawlRequest.php
 create mode 100644 app/Services/WebCrawlerService.php

diff --git a/app/Http/Controllers/WebCrawlController.php b/app/Http/Controllers/WebCrawlController.php
index e22c4ca..3a153a3 100644
--- a/app/Http/Controllers/WebCrawlController.php
+++ b/app/Http/Controllers/WebCrawlController.php
@@ -2,158 +2,39 @@

 namespace App\Http\Controllers;

-use App\Models\WebCrawl;
-use GuzzleHttp\Client;
-use Illuminate\Http\Request;
-use Illuminate\Support\Facades\Log;
+use App\Http\Requests\WebCrawlRequest;
+use App\Services\WebCrawlerService;

 class WebCrawlController extends Controller
 {
-    protected $webClient;
+    protected $webCrawlerService;
+
+    public function __construct(WebCrawlerService $webCrawlerService)
+    {
+        $this->webCrawlerService = $webCrawlerService;
+    }

     public function index()
     {
-        $allCrawls = WebCrawl::all();
-        return response()->json($allCrawls);
+        return $this->webCrawlerService->getAllCrawls();
     }

-    public function crawlWebsite(Request $request)
+    public function crawlWebsite(WebCrawlRequest $request)
     {
         $url = $request->query('url');
         $depth = $request->query('depth', 0);
         $refresh = $request->query('refresh', false);

-        // Check if the URL is already in the database
-        $webCrawl = WebCrawl::where('url', $url)->first();
-        if ($webCrawl && !$refresh) {
-            Log::error("This URL already exists in the database $url");
-            return response()->json([
-                'error' => 'This URL already exists in the database',
-            ], 400);
-        }
-
-        $this->webClient = new Client();
-
-        // Use GuzzleHttp client to send HTTP requests
-        $response = $this->scan($url);
-        if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
-            $crawler = $webCrawl ?? new WebCrawl();
-            $crawler->url = $url;
-            $crawler->content = $response->getBody()->getContents();
-            $linksFromPage = $this->getLinksFromPage($crawler->content);
-            try {
-                $crawler->save();
-            } catch (\Exception $e) {
-                Log::error($e->getMessage());
-                return response()->json([
-                    'error' => 'Failed to save the URL to the database',
-                ], 500);
-            }
-            if ($depth > 0 && count($linksFromPage) > 0) {
-                $results = [];
-                foreach ($linksFromPage as $link) {
-                    $results[] = $this->crawlWebsiteRecursive($link, $depth - 1);
-                }
-                return response()->json([
-                    'message' => 'Crawling completed successfully',
-                    'data' => $url
-                ]);
-            }
-        } else {
-            Log::error("Failed to retrieve the URL $url");
-            return response()->json([
-                'error' => 'Failed to retrieve the URL',
-            ], 500);
-        }
-
-        Log::info("Crawling completed successfully For URL $url");
-        return response()->json([
-            'message' => 'Crawling completed successfully',
-        ]);
-    }
-
-    protected function crawlWebsiteRecursive($url, $depth)
-    {
-        // Check if the URL is already in the database
-        $webCrawl = WebCrawl::where('url', $url)->first();
-        if ($webCrawl) {
-            return [];
-        }
-
-        // Use GuzzleHttp client to send HTTP requests
-        $response = $this->scan($url);
-        if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
-            $crawler = new WebCrawl();
-            $crawler->url = $url;
-            $crawler->content = $response->getBody()->getContents();
-            $linksFromPage = $this->getLinksFromPage($crawler->content);
-            try {
-                $crawler->save();
-                Log::info("URL saved to the database $url");
-            } catch (\Exception $e) {
-                Log::error("Can't save the URL to the database $url");
-                return [];
-            }
-            if ($depth > 0 && count($linksFromPage) > 0) {
-                $results = [];
-                foreach ($linksFromPage as $link) {
-                    $results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
-                }
-                return $results;
-            }
-        }
-
-        return [];
-    }
-
-    protected function getLinksFromPage($crawlerContent)
-    {
-        $dom = new \DOMDocument();
-        @$dom->loadHTML($crawlerContent);
-        $links = $dom->getElementsByTagName('a');
-        $linksFromPage = [];
-        foreach ($links as $link) {
-            if ($link->getAttribute('href') && filter_var($link->getAttribute('href'), FILTER_VALIDATE_URL)) {
-                $linksFromPage[] = $link->getAttribute('href');
-            }
-        }
-        return $linksFromPage;
-    }
-
-    protected function scan($url)
-    {
-        try {
-            $result = $this->webClient->request('GET', $url);
-        } catch (\Exception $e) {
-            return response()->json([
-                'error' => 'Failed to retrieve the URL',
-            ], 500);
-        }
-        return $result;
+        return $this->webCrawlerService->crawlWebsite($url, $depth, $refresh);
     }

     public function destroy($id)
     {
-        $webCrawl = WebCrawl::find($id);
-        if ($webCrawl) {
-            $webCrawl->delete();
-            Log::info("Web crawl deleted successfully For ID $id");
-            return response()->json([
-                'message' => 'Web crawl deleted successfully',
-            ]);
-        }
-        Log::error("Web crawl not found For ID $id");
-        return response()->json([
-            'error' => 'Web crawl not found',
-        ], 404);
+        return $this->webCrawlerService->deleteCrawl($id);
     }

     public function destroyAll()
     {
-        WebCrawl::truncate();
-        Log::info("All web crawls deleted successfully");
-        return response()->json([
-            'message' => 'All web crawls deleted successfully',
-        ]);
+        return $this->webCrawlerService->deleteAllCrawls();
     }
 }
diff --git a/app/Http/Requests/WebCrawlRequest.php b/app/Http/Requests/WebCrawlRequest.php
new file mode 100644
index 0000000..22dfeca
--- /dev/null
+++ b/app/Http/Requests/WebCrawlRequest.php
@@ -0,0 +1,23 @@
+<?php
+
+namespace App\Http\Requests;
+
+use Illuminate\Foundation\Http\FormRequest;
+
+class WebCrawlRequest extends FormRequest
+{
+    public function authorize()
+    {
+        return true;
+    }
+
+    public function rules()
+    {
+        return [
+            'url' => 'required|url',
+            'depth' => 'integer|min:0',
+            'refresh' => 'integer',
+        ];
+    }
+}
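The form request above moves query-string validation out of the controller: Laravel resolves WebCrawlRequest before crawlWebsite() runs, so invalid input never reaches the service. A minimal feature-test sketch of that behaviour follows; the /api/crawl route and the test class are illustrative assumptions, not part of this patch.

<?php

namespace Tests\Feature;

use Tests\TestCase;

class WebCrawlValidationTest extends TestCase
{
    // Assumes GET /api/crawl is routed to WebCrawlController::crawlWebsite (hypothetical route).
    public function test_missing_url_is_rejected(): void
    {
        // 'url' is required by WebCrawlRequest, so a JSON request without it
        // should receive a 422 with a validation error for that field.
        $this->getJson('/api/crawl?depth=1')
            ->assertStatus(422)
            ->assertJsonValidationErrors(['url']);
    }

    public function test_negative_depth_is_rejected(): void
    {
        // 'depth' must satisfy the 'integer|min:0' rule.
        $this->getJson('/api/crawl?url=https://example.com&depth=-1')
            ->assertStatus(422)
            ->assertJsonValidationErrors(['depth']);
    }
}

Note that 'refresh' is validated as 'integer', so callers should send refresh=1 rather than refresh=true.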
diff --git a/app/Providers/CrawlerServiceProvider.php b/app/Providers/CrawlerServiceProvider.php
index 8924dda..06c68ed 100644
--- a/app/Providers/CrawlerServiceProvider.php
+++ b/app/Providers/CrawlerServiceProvider.php
@@ -2,34 +2,15 @@

 namespace App\Providers;

+use App\Services\WebCrawlerService;
 use Illuminate\Support\ServiceProvider;

 class CrawlerServiceProvider extends ServiceProvider
 {
-    /**
-     * Register services.
-     *
-     * @return void
-     */
     public function register()
     {
-        //
+        $this->app->singleton(WebCrawlerService::class, function () {
+            return new WebCrawlerService();
+        });
     }
-
-    public function crawlWebsite($url, $depth) {
-        $visitedUrls = [];
-        echo 'HERE!';die;
-    }
-
-    /**
-     * Bootstrap services.
-     *
-     * @return void
-     */
-    public function boot()
-    {
-        //
-    }
-
-
-}
+}
\ No newline at end of file
diff --git a/app/Services/WebCrawlerService.php b/app/Services/WebCrawlerService.php
new file mode 100644
index 0000000..211b7f3
--- /dev/null
+++ b/app/Services/WebCrawlerService.php
@@ -0,0 +1,154 @@
+<?php
+
+namespace App\Services;
+
+use App\Models\WebCrawl;
+use GuzzleHttp\Client;
+use Illuminate\Support\Facades\Log;
+
+class WebCrawlerService
+{
+    protected $webClient;
+
+    public function __construct()
+    {
+        $this->webClient = new Client();
+    }
+
+    public function getAllCrawls()
+    {
+        $allCrawls = WebCrawl::all();
+        return response()->json($allCrawls);
+    }
+
+    public function crawlWebsite($url, $depth, $refresh)
+    {
+        // Check if the URL is already in the database
+        $webCrawl = WebCrawl::where('url', $url)->first();
+        if ($webCrawl && !$refresh) {
+            Log::error("This URL already exists in the database $url");
+            return response()->json([
+                'error' => 'This URL already exists in the database',
+            ], 400);
+        }
+
+        $response = $this->scan($url);
+        if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
+            $crawler = $webCrawl ?? new WebCrawl();
+            $crawler->url = $url;
+            $crawler->content = $response->getBody()->getContents();
+            $linksFromPage = $this->getLinksFromPage($crawler->content);
+            try {
+                $crawler->save();
+            } catch (\Exception $e) {
+                Log::error($e->getMessage());
+                return response()->json([
+                    'error' => 'Failed to save the URL to the database',
+                ], 500);
+            }
+            if ($depth > 0 && count($linksFromPage) > 0) {
+                $results = [];
+                foreach ($linksFromPage as $link) {
+                    $results[] = $this->crawlWebsiteRecursive($link, $depth - 1);
+                }
+                return response()->json([
+                    'message' => 'Crawling completed successfully',
+                    'data' => $url
+                ]);
+            }
+        } else {
+            Log::error("Failed to retrieve the URL $url");
+            return response()->json([
+                'error' => 'Failed to retrieve the URL',
+            ], 500);
+        }
+
+        Log::info("Crawling completed successfully For URL $url");
+        return response()->json([
+            'message' => 'Crawling completed successfully',
+        ]);
+    }
+
+    public function deleteCrawl($id)
+    {
+        $webCrawl = WebCrawl::find($id);
+        if ($webCrawl) {
+            $webCrawl->delete();
+            Log::info("Web crawl deleted successfully For ID $id");
+            return response()->json([
+                'message' => 'Web crawl deleted successfully',
+            ]);
+        }
+        Log::error("Web crawl not found For ID $id");
+        return response()->json([
+            'error' => 'Web crawl not found',
+        ], 404);
+    }
+
+    public function deleteAllCrawls()
+    {
+        WebCrawl::truncate();
+        Log::info("All web crawls deleted successfully");
+        return response()->json([
+            'message' => 'All web crawls deleted successfully',
+        ]);
+    }
+
+    protected function crawlWebsiteRecursive($url, $depth)
+    {
+        $webCrawl = WebCrawl::where('url', $url)->first();
+        if ($webCrawl) {
+            return [];
+        }
+
+        $response = $this->scan($url);
+        if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
+            $crawler = new WebCrawl();
+            $crawler->url = $url;
+            $crawler->content = $response->getBody()->getContents();
+            $linksFromPage = $this->getLinksFromPage($crawler->content);
+            try {
+                $crawler->save();
+                Log::info("URL saved to the database $url");
+            } catch (\Exception $e) {
+                Log::error("Can't save the URL to the database $url");
+                return [];
+            }
+            if ($depth > 0 && count($linksFromPage) > 0) {
+                $results = [];
+                foreach ($linksFromPage as $link) {
+                    $results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
+                }
+                return $results;
+            }
+        }
+
+        return [];
+    }
+
+    protected function getLinksFromPage($crawlerContent)
+    {
+        $dom = new \DOMDocument();
+        @$dom->loadHTML($crawlerContent);
+        $links = $dom->getElementsByTagName('a');
+        $linksFromPage = [];
+        foreach ($links as $link) {
+            if ($link->getAttribute('href') && filter_var($link->getAttribute('href'), FILTER_VALIDATE_URL)) {
+                $linksFromPage[] = $link->getAttribute('href');
+            }
+        }
+        return $linksFromPage;
+    }
+
+    protected function scan($url)
+    {
+        try {
+            $result = $this->webClient->request('GET', $url);
+        } catch (\Exception $e) {
+            return response()->json([
+                'error' => 'Failed to retrieve the URL',
+            ], 500);
+        }
+        return $result;
+    }
+}
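For the new singleton binding to take effect, the provider still has to be registered with the application. A sketch of that wiring, assuming the default config/app.php providers array and the class name kept in step with its file name:

<?php

// config/app.php (sketch): registering the provider makes the container hand out
// the same WebCrawlerService instance wherever it is type-hinted, e.g. in
// WebCrawlController's constructor.
return [
    // ...
    'providers' => [
        // ...
        App\Providers\CrawlerServiceProvider::class,
    ],
];

Even without this binding, Laravel's container can auto-resolve WebCrawlerService because it has no constructor dependencies; the provider only makes it a shared singleton, so the Guzzle client created in its constructor is reused across calls.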