    $this->webClient = new Client();
}

// Return every stored crawl as JSON.
public function getAllCrawls()
{
    $allCrawls = WebCrawl::all();

    return response()->json($allCrawls);
}

// Crawl a URL, persist its content, and optionally follow links up to $depth levels.
public function crawlWebsite($url, $depth, $refresh)
{
    $webCrawl = WebCrawl::where('url', $url)->first();

    // Refuse to re-crawl a known URL unless a refresh was requested.
    if ($webCrawl && !$refresh) {
        Log::error("This URL already exists in the database: $url");

        return response()->json([
            'error' => 'This URL already exists in the database',
        ], 400);
    }

    $response = $this->scan($url);

    if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
        // Reuse the existing record on refresh, otherwise create a new one.
        $crawler = $webCrawl ?? new WebCrawl();
        $crawler->url = $url;
        $crawler->content = $response->getBody()->getContents();
        $linksFromPage = $this->getLinksFromPage($crawler->content);

        try {
            $crawler->save();
        } catch (\Exception $e) {
            Log::error($e->getMessage());

            return response()->json([
                'error' => 'Failed to save the URL to the database',
            ], 500);
        }

        // Follow the links found on the page one level deeper.
        if ($depth > 0 && count($linksFromPage) > 0) {
            $results = [];
            foreach ($linksFromPage as $link) {
                $results[] = $this->crawlWebsiteRecursive($link, $depth - 1);
            }

            return response()->json([
                'message' => 'Crawling completed successfully',
                'data' => $url,
            ]);
        }
    } else {
        Log::error("Failed to retrieve the URL: $url");

        return response()->json([
            'error' => 'Failed to retrieve the URL',
        ], 500);
    }

    Log::info("Crawling completed successfully for URL: $url");

    return response()->json([
        'message' => 'Crawling completed successfully',
    ]);
}

// Delete a single crawl by its primary key.
public function deleteCrawl($id)
{
    $webCrawl = WebCrawl::find($id);

    if ($webCrawl) {
        $webCrawl->delete();
        Log::info("Web crawl deleted successfully for ID: $id");

        return response()->json([
            'message' => 'Web crawl deleted successfully',
        ]);
    }

    Log::error("Web crawl not found for ID: $id");

    return response()->json([
        'error' => 'Web crawl not found',
    ], 404);
}

// Remove every stored crawl.
public function deleteAllCrawls()
{
    WebCrawl::truncate();
    Log::info("All web crawls deleted successfully");

    return response()->json([
        'message' => 'All web crawls deleted successfully',
    ]);
}

// Recursive worker: skips URLs already stored, saves new pages, and
// returns the accumulated results from deeper levels.
protected function crawlWebsiteRecursive($url, $depth)
{
    $webCrawl = WebCrawl::where('url', $url)->first();

    if ($webCrawl) {
        return [];
    }

    $response = $this->scan($url);

    if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
        $crawler = new WebCrawl();
        $crawler->url = $url;
        $crawler->content = $response->getBody()->getContents();
        $linksFromPage = $this->getLinksFromPage($crawler->content);

        try {
            $crawler->save();
            Log::info("URL saved to the database: $url");
        } catch (\Exception $e) {
            Log::error("Can't save the URL to the database: $url");

            return [];
        }

        if ($depth > 0 && count($linksFromPage) > 0) {
            $results = [];
            foreach ($linksFromPage as $link) {
                $results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
            }

            return $results;
        }
    }

    return [];
}

// Extract all absolute links (<a href>) from the crawled HTML.
protected function getLinksFromPage($crawlerContent)
{
    $dom = new \DOMDocument();
    @$dom->loadHTML($crawlerContent);
    $links = $dom->getElementsByTagName('a');
    $linksFromPage = [];

    foreach ($links as $link) {
        if ($link->getAttribute('href') && filter_var($link->getAttribute('href'), FILTER_VALIDATE_URL)) {
            $linksFromPage[] = $link->getAttribute('href');
        }
    }

    return $linksFromPage;
}

// Fetch a URL with the HTTP client; on failure, return a 500 JSON response,
// which the callers detect via its non-2xx status code.
protected function scan($url)
{
    try {
        $result = $this->webClient->request('GET', $url);
    } catch (\Exception $e) {
        return response()->json([
            'error' => 'Failed to retrieve the URL',
        ], 500);
    }

    return $result;
}
}
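The fragment above starts inside the constructor, so the class header is not shown. The following is a minimal sketch of the imports and declaration the code appears to assume: the Guzzle client and the Log facade are implied by the calls in the methods, while the controller class name (WebCrawlController) and the WebCrawl model namespace are assumptions for illustration only.

// Hedged sketch of the assumed file header; class and model names are illustrative.
namespace App\Http\Controllers;

use App\Models\WebCrawl;             // assumed location of the WebCrawl Eloquent model
use GuzzleHttp\Client;               // used by scan() to perform GET requests
use Illuminate\Support\Facades\Log;  // used for the info/error log calls

class WebCrawlController extends Controller
{
    /** @var Client HTTP client shared by all crawl requests */
    protected $webClient;

    public function __construct()
    {
        $this->webClient = new Client();
    }

    // ... methods as shown above ...
}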