Web_Crawler_API/app/Http/Controllers/WebCrawlController.php

<?php
namespace App\Http\Controllers;

use App\Models\WebCrawl;
use GuzzleHttp\Client;
use Illuminate\Http\Request;
use Illuminate\Support\Facades\Log;

class WebCrawlController extends Controller
{
    /** @var Client HTTP client used to fetch pages. */
    protected $webClient;
    /**
     * Return every stored crawl as JSON.
     */
    public function index()
    {
        $allCrawls = WebCrawl::all();

        return response()->json($allCrawls);
    }
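    /*
     * Example response shape (an assumption based on the columns this
     * controller writes, plus Eloquent's default id/timestamp columns):
     *
     *     [
     *         {"id": 1, "url": "https://example.com", "content": "<html>...", ...}
     *     ]
     */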
    /**
     * Crawl the URL given as a query parameter, store the page, and follow
     * the links it contains up to the requested depth.
     */
    public function crawlWebsite(Request $request)
    {
        $url = $request->query('url');
        $depth = (int) $request->query('depth', 0);
        // Query parameters arrive as strings, so the literal "false" would
        // be truthy; FILTER_VALIDATE_BOOLEAN turns it into a real boolean.
        $refresh = filter_var($request->query('refresh', false), FILTER_VALIDATE_BOOLEAN);

        // Check if the URL is already in the database
        $webCrawl = WebCrawl::where('url', $url)->first();
        if ($webCrawl && !$refresh) {
            Log::error("This URL already exists in the database $url");

            return response()->json([
                'error' => 'This URL already exists in the database',
            ], 400);
        }

        // Use GuzzleHttp client to send HTTP requests
        $this->webClient = new Client();
        $response = $this->scan($url);

        if ($response === null || $response->getStatusCode() < 200 || $response->getStatusCode() >= 300) {
            Log::error("Failed to retrieve the URL $url");

            return response()->json([
                'error' => 'Failed to retrieve the URL',
            ], 500);
        }

        // Reuse the existing record when the caller asked for a refresh.
        $crawler = $webCrawl ?? new WebCrawl();
        $crawler->url = $url;
        $crawler->content = $response->getBody()->getContents();
        $linksFromPage = $this->getLinksFromPage($crawler->content);

        try {
            $crawler->save();
        } catch (\Exception $e) {
            Log::error($e->getMessage());

            return response()->json([
                'error' => 'Failed to save the URL to the database',
            ], 500);
        }

        // Collect the URLs saved by the recursive crawl so the caller can
        // see what was actually fetched.
        $results = [];
        if ($depth > 0 && count($linksFromPage) > 0) {
            foreach ($linksFromPage as $link) {
                $results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
            }
        }

        Log::info("Crawling completed successfully for URL $url");

        return response()->json([
            'message' => 'Crawling completed successfully',
            'data' => [
                'url' => $url,
                'crawled' => $results,
            ],
        ]);
    }
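    /*
     * A quick usage sketch, assuming this action is bound to GET /crawl
     * (the route path is an assumption, not taken from this file):
     *
     *     curl "http://localhost:8000/crawl?url=https://example.com&depth=2"
     *     curl "http://localhost:8000/crawl?url=https://example.com&refresh=true"
     *
     * The first call stores the page and follows its links two levels deep;
     * the second re-fetches a URL that is already in the database.
     */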
    /**
     * Recursively crawl a URL and return the list of URLs that were saved.
     */
    protected function crawlWebsiteRecursive($url, $depth)
    {
        // Skip URLs that are already in the database so cycles between
        // pages cannot recurse forever.
        $webCrawl = WebCrawl::where('url', $url)->first();
        if ($webCrawl) {
            return [];
        }

        // Use GuzzleHttp client to send HTTP requests
        $response = $this->scan($url);
        if ($response === null || $response->getStatusCode() < 200 || $response->getStatusCode() >= 300) {
            return [];
        }

        $crawler = new WebCrawl();
        $crawler->url = $url;
        $crawler->content = $response->getBody()->getContents();
        $linksFromPage = $this->getLinksFromPage($crawler->content);

        try {
            $crawler->save();
            Log::info("URL saved to the database $url");
        } catch (\Exception $e) {
            Log::error("Can't save the URL to the database $url");

            return [];
        }

        // Report this URL as crawled, plus everything saved below it.
        $results = [$url];
        if ($depth > 0 && count($linksFromPage) > 0) {
            foreach ($linksFromPage as $link) {
                $results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
            }
        }

        return $results;
    }
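    /*
     * Example: crawling a page at depth 1 whose links include
     * https://example.com/about and https://example.com/contact would
     * return something like (URLs are illustrative):
     *
     *     ['https://example.com/about', 'https://example.com/contact']
     *
     * i.e. the URLs newly saved beneath the starting page.
     */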
    /**
     * Extract every absolute link from an HTML document.
     */
    protected function getLinksFromPage($crawlerContent)
    {
        $dom = new \DOMDocument();
        // Collect libxml warnings internally instead of emitting them,
        // since real-world HTML is rarely well formed.
        libxml_use_internal_errors(true);
        $dom->loadHTML($crawlerContent);
        libxml_clear_errors();

        $linksFromPage = [];
        foreach ($dom->getElementsByTagName('a') as $link) {
            $href = $link->getAttribute('href');
            // FILTER_VALIDATE_URL only accepts absolute URLs, so relative
            // links are dropped here.
            if ($href && filter_var($href, FILTER_VALIDATE_URL)) {
                $linksFromPage[] = $href;
            }
        }

        return $linksFromPage;
    }
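    /*
     * Example: given the markup below, getLinksFromPage returns only
     * ['https://example.com/a']; the relative href fails FILTER_VALIDATE_URL:
     *
     *     <a href="https://example.com/a">absolute</a>
     *     <a href="/b">relative</a>
     */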
    /**
     * Fetch a URL with the shared Guzzle client. Returns the PSR-7
     * response, or null when the request fails, so callers can decide
     * how to report the error.
     */
    protected function scan($url)
    {
        try {
            return $this->webClient->request('GET', $url);
        } catch (\Exception $e) {
            Log::error("Failed to retrieve the URL $url: " . $e->getMessage());

            return null;
        }
    }
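    /*
     * A minimal hardening sketch: the Client could be constructed with
     * timeouts so one slow host cannot stall a whole crawl (the values
     * below are assumptions, not taken from this file):
     *
     *     $this->webClient = new Client([
     *         'timeout'         => 10,  // total seconds per request
     *         'connect_timeout' => 5,   // seconds to establish the connection
     *     ]);
     */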
    /**
     * Delete a single crawl record by its primary key.
     */
    public function destroy($id)
    {
        $webCrawl = WebCrawl::find($id);
        if ($webCrawl) {
            $webCrawl->delete();
            Log::info("Web crawl deleted successfully for ID $id");

            return response()->json([
                'message' => 'Web crawl deleted successfully',
            ]);
        }

        Log::error("Web crawl not found for ID $id");

        return response()->json([
            'error' => 'Web crawl not found',
        ], 404);
    }
    /**
     * Delete every crawl record by truncating the table.
     */
    public function destroyAll()
    {
        WebCrawl::truncate();
        Log::info('All web crawls deleted successfully');

        return response()->json([
            'message' => 'All web crawls deleted successfully',
        ]);
    }
}
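
/*
 * A sketch of the route registration this controller assumes; the paths
 * and HTTP verbs are assumptions, not taken from this file:
 *
 *     use App\Http\Controllers\WebCrawlController;
 *
 *     Route::get('/crawls', [WebCrawlController::class, 'index']);
 *     Route::get('/crawl', [WebCrawlController::class, 'crawlWebsite']);
 *     Route::delete('/crawls/{id}', [WebCrawlController::class, 'destroy']);
 *     Route::delete('/crawls', [WebCrawlController::class, 'destroyAll']);
 */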