        json($allCrawls);
    }

    public function crawlWebsite(Request $request)
    {
        $url = $request->query('url');
        $depth = (int) $request->query('depth', 1);
        // Query-string values arrive as strings, so normalise "refresh" to a real boolean
        $refresh = filter_var($request->query('refresh', false), FILTER_VALIDATE_BOOLEAN);

        // Check if the URL is already in the database
        $webCrawl = WebCrawl::where('url', $url)->first();
        if ($webCrawl && !$refresh) {
            return response()->json([
                'error' => 'This URL already exists in the database',
            ], 400);
        }

        // Use a GuzzleHttp client to send HTTP requests
        $this->webClient = new Client();
        $response = $this->scan($url);

        if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
            // Reuse the existing record when refreshing, otherwise create a new one
            $crawler = $webCrawl ?? new WebCrawl();
            $crawler->url = $url;
            $crawler->content = $response->getBody()->getContents();
            $linksFromPage = $this->getLinksFromPage($crawler->content);

            try {
                $crawler->save();
            } catch (\Exception $e) {
                return response()->json([
                    'error' => 'Failed to save the URL to the database',
                ], 500);
            }

            // Follow the links found on the page until the requested depth is exhausted
            if ($depth > 0 && count($linksFromPage) > 0) {
                foreach ($linksFromPage as $link) {
                    $this->crawlWebsiteRecursive($link, $depth - 1);
                }

                return response()->json([
                    'message' => 'Crawling completed successfully',
                    'data' => $url,
                ]);
            }
        } else {
            return response()->json([
                'error' => 'Failed to retrieve the URL',
            ], 500);
        }

        return response()->json([
            'message' => 'Crawling completed successfully',
        ]);
    }

    protected function crawlWebsiteRecursive($url, $depth)
    {
        // Skip URLs that have already been crawled
        $webCrawl = WebCrawl::where('url', $url)->first();
        if ($webCrawl) {
            return [];
        }

        // Use the GuzzleHttp client to fetch the page
        $response = $this->scan($url);

        if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
            $crawler = new WebCrawl();
            $crawler->url = $url;
            $crawler->content = $response->getBody()->getContents();
            $linksFromPage = $this->getLinksFromPage($crawler->content);

            try {
                $crawler->save();
            } catch (\Exception $e) {
                return [];
            }

            // Recurse into the discovered links, merging the results from each branch
            if ($depth > 0 && count($linksFromPage) > 0) {
                $results = [];
                foreach ($linksFromPage as $link) {
                    $results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
                }

                return $results;
            }
        }

        return [];
    }

    protected function getLinksFromPage($crawlerContent)
    {
        $dom = new \DOMDocument();
        // The @ suppresses warnings triggered by malformed real-world HTML
        @$dom->loadHTML($crawlerContent);

        $links = $dom->getElementsByTagName('a');
        $linksFromPage = [];
        foreach ($links as $link) {
            // Keep only href attributes that are absolute, valid URLs
            if ($link->getAttribute('href') && filter_var($link->getAttribute('href'), FILTER_VALIDATE_URL)) {
                $linksFromPage[] = $link->getAttribute('href');
            }
        }

        return $linksFromPage;
    }

    protected function scan($url)
    {
        // Disable Guzzle's default exception on 4xx/5xx responses so callers can
        // inspect the status code themselves; connection failures still throw.
        return $this->webClient->request('GET', $url, ['http_errors' => false]);
    }

    public function destroy($id)
    {
        $webCrawl = WebCrawl::find($id);
        if ($webCrawl) {
            $webCrawl->delete();

            return response()->json([
                'message' => 'Web crawl deleted successfully',
            ]);
        }

        return response()->json([
            'error' => 'Web crawl not found',
        ], 404);
    }

    public function destroyAll()
    {
        WebCrawl::truncate();

        return response()->json([
            'message' => 'All web crawls deleted successfully',
        ]);
    }
}
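
// Usage sketch (assumption): the controller's class declaration is truncated in the
// fragment above, so the name "WebCrawlController" and the route paths below are
// hypothetical. If these actions are exposed over HTTP, they could be registered in
// routes/api.php roughly like this:
//
//     use App\Http\Controllers\WebCrawlController;
//     use Illuminate\Support\Facades\Route;
//
//     Route::get('/crawl', [WebCrawlController::class, 'crawlWebsite']);   // ?url=...&depth=1&refresh=true
//     Route::delete('/crawl/{id}', [WebCrawlController::class, 'destroy']);
//     Route::delete('/crawl', [WebCrawlController::class, 'destroyAll']);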