Web_Crawler_API/app/Http/Controllers/WebCrawlController.php
<?php

namespace App\Http\Controllers;

use App\Models\WebCrawl;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use Illuminate\Http\Request;

class WebCrawlController extends Controller
{
    /** @var Client HTTP client used to fetch pages */
    protected $webClient;

    /**
     * Return every stored crawl record as JSON.
     */
    public function index()
    {
        $allCrawls = WebCrawl::all();
        return response()->json($allCrawls);
    }
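
    /*
     * Minimal usage sketch. The route wiring below is an assumption and not part
     * of this file; the paths and the /api prefix are hypothetical (e.g. in
     * routes/api.php), shown only to illustrate how the actions are reached:
     *
     *   Route::get('/crawls', [WebCrawlController::class, 'index']);
     *   Route::get('/crawl', [WebCrawlController::class, 'crawlWebsite']);
     *   Route::delete('/crawl/{id}', [WebCrawlController::class, 'destroy']);
     *   Route::delete('/crawls', [WebCrawlController::class, 'destroyAll']);
     *
     *   Example request: GET /api/crawl?url=https://example.com&depth=2&refresh=true
     */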
    /**
     * Crawl the URL given in the query string, store the page, and follow the
     * links found on it up to the requested depth.
     */
    public function crawlWebsite(Request $request)
    {
        $url = $request->query('url');
        $depth = (int) $request->query('depth', 1);
        // Query parameters arrive as strings, so coerce "true"/"1"/"on" to a real boolean
        $refresh = filter_var($request->query('refresh', false), FILTER_VALIDATE_BOOLEAN);

        // Check if the URL is already in the database
        $webCrawl = WebCrawl::where('url', $url)->first();
        if ($webCrawl && !$refresh) {
            return response()->json([
                'error' => 'This URL already exists in the database',
            ], 400);
        }

        // Use GuzzleHttp client to send HTTP requests; disable http_errors so
        // 4xx/5xx responses are returned instead of being thrown as exceptions
        $this->webClient = new Client(['http_errors' => false]);
        try {
            $response = $this->scan($url);
        } catch (GuzzleException $e) {
            return response()->json([
                'error' => 'Failed to retrieve the URL',
            ], 500);
        }

        if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
            $crawler = $webCrawl ?? new WebCrawl();
            $crawler->url = $url;
            $crawler->content = $response->getBody()->getContents();
            $linksFromPage = $this->getLinksFromPage($crawler->content);
            try {
                $crawler->save();
            } catch (\Exception $e) {
                return response()->json([
                    'error' => 'Failed to save the URL to the database',
                ], 500);
            }
            if ($depth > 0 && count($linksFromPage) > 0) {
                // Follow each link; the recursive call persists every page it fetches
                foreach ($linksFromPage as $link) {
                    $this->crawlWebsiteRecursive($link, $depth - 1);
                }
                return response()->json([
                    'message' => 'Crawling completed successfully',
                    'data' => $url,
                ]);
            }
        } else {
            return response()->json([
                'error' => 'Failed to retrieve the URL',
            ], 500);
        }

        return response()->json([
            'message' => 'Crawling completed successfully',
        ]);
    }
    /**
     * Recursive helper: fetch a single URL, store it, and follow its links
     * until the remaining depth is exhausted.
     */
    protected function crawlWebsiteRecursive($url, $depth)
    {
        // Skip URLs that are already in the database to avoid loops and duplicates
        $webCrawl = WebCrawl::where('url', $url)->first();
        if ($webCrawl) {
            return [];
        }

        // Use GuzzleHttp client to send HTTP requests
        try {
            $response = $this->scan($url);
        } catch (GuzzleException $e) {
            return [];
        }

        if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
            $crawler = new WebCrawl();
            $crawler->url = $url;
            $crawler->content = $response->getBody()->getContents();
            $linksFromPage = $this->getLinksFromPage($crawler->content);
            try {
                $crawler->save();
            } catch (\Exception $e) {
                return [];
            }
            if ($depth > 0 && count($linksFromPage) > 0) {
                $results = [];
                foreach ($linksFromPage as $link) {
                    $results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
                }
                return $results;
            }
        }
        return [];
    }
    /**
     * Extract all absolute links (<a href="...">) from an HTML document.
     * Relative URLs do not pass FILTER_VALIDATE_URL and are skipped.
     */
    protected function getLinksFromPage($crawlerContent)
    {
        $dom = new \DOMDocument();
        // Suppress warnings from malformed HTML
        @$dom->loadHTML($crawlerContent);
        $links = $dom->getElementsByTagName('a');
        $linksFromPage = [];
        foreach ($links as $link) {
            $href = $link->getAttribute('href');
            if ($href && filter_var($href, FILTER_VALIDATE_URL)) {
                $linksFromPage[] = $href;
            }
        }
        return $linksFromPage;
    }
    /**
     * Perform a GET request against the given URL using the shared client.
     */
    protected function scan($url)
    {
        return $this->webClient->request('GET', $url);
    }
    public function destroy($id)
    {
        $webCrawl = WebCrawl::find($id);
        if ($webCrawl) {
            $webCrawl->delete();
            return response()->json([
                'message' => 'Web crawl deleted successfully',
            ]);
        }
        return response()->json([
            'error' => 'Web crawl not found',
        ], 404);
    }

    public function destroyAll()
    {
        WebCrawl::truncate();
        return response()->json([
            'message' => 'All web crawls deleted successfully',
        ]);
    }
}