<?php

namespace App\Http\Controllers;

use App\Models\WebCrawl;
use GuzzleHttp\Client;
use Illuminate\Http\Request;

class WebCrawlController extends Controller
{
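    /**
     * Guzzle HTTP client used to fetch pages during a crawl.
     *
     * @var \GuzzleHttp\Client
     */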
    protected $webClient;
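
    /**
     * Return all stored crawl records as JSON.
     */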
    public function index()
    {
        $allCrawls = WebCrawl::all();

        return response()->json($allCrawls);
    }
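
    /**
     * Crawl the URL given in the "url" query parameter, store the fetched page,
     * and follow the links it contains up to "depth" levels (default 1),
     * e.g. ?url=https://example.com&depth=2.
     */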
    public function crawlWebsite(Request $request)
    {
        $url = $request->query('url');
        $depth = (int) $request->query('depth', 1);

        // Reject missing or malformed URLs before doing any work
        if (!$url || !filter_var($url, FILTER_VALIDATE_URL)) {
            return response()->json([
                'error' => 'A valid URL must be provided in the "url" query parameter',
            ], 400);
        }

        // Check if the URL is already in the database
        $webCrawl = WebCrawl::where('url', $url)->first();
        if ($webCrawl) {
            return response()->json([
                'error' => 'This URL already exists in the database',
            ], 400);
        }

        $this->webClient = new Client();

        // Use the GuzzleHttp client to fetch the page
        $response = $this->scan($url);
        if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
            $crawler = new WebCrawl();
            $crawler->url = $url;
            $crawler->content = $response->getBody()->getContents();
            $linksFromPage = $this->getLinksFromPage($crawler->content);

            try {
                $crawler->save();
            } catch (\Exception $e) {
                return response()->json([
                    'error' => 'Failed to save the URL to the database',
                ], 500);
            }

            // Follow the links found on the page, one level deeper each time
            if ($depth > 0 && count($linksFromPage) > 0) {
                foreach ($linksFromPage as $link) {
                    $this->crawlWebsiteRecursive($link, $depth - 1);
                }
            }
        } else {
            return response()->json([
                'error' => 'Failed to retrieve the URL',
            ], 500);
        }

        return response()->json([
            'message' => 'Crawling completed successfully',
        ]);
    }
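
    /**
     * Fetch and store a linked URL, then follow its links while depth remains,
     * skipping URLs that are already in the database.
     */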
    protected function crawlWebsiteRecursive($url, $depth)
    {
        // Check if the URL is already in the database
        $webCrawl = WebCrawl::where('url', $url)->first();
        if ($webCrawl) {
            return;
        }

        // Use the GuzzleHttp client to fetch the page
        $response = $this->scan($url);
        if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
            $crawler = new WebCrawl();
            $crawler->url = $url;
            $crawler->content = $response->getBody()->getContents();
            $linksFromPage = $this->getLinksFromPage($crawler->content);

            try {
                $crawler->save();
            } catch (\Exception $e) {
                return;
            }

            if ($depth > 0 && count($linksFromPage) > 0) {
                foreach ($linksFromPage as $link) {
                    $this->crawlWebsiteRecursive($link, $depth - 1);
                }
            }
        }
    }
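
    /**
     * Extract all absolute link URLs (<a href>) from the given HTML content.
     */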
    protected function getLinksFromPage($crawlerContent)
    {
        $dom = new \DOMDocument();
        @$dom->loadHTML($crawlerContent);
        $links = $dom->getElementsByTagName('a');
        $linksFromPage = [];
        foreach ($links as $link) {
            $href = $link->getAttribute('href');
            if ($href && filter_var($href, FILTER_VALIDATE_URL)) {
                $linksFromPage[] = $href;
            }
        }

        return $linksFromPage;
    }
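
    /**
     * Fetch the given URL with the Guzzle client and return the response.
     */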
    protected function scan($url)
    {
        // Disable Guzzle's HTTP-error exceptions so non-2xx responses are
        // returned to the callers, which already check the status code
        $result = $this->webClient->request('GET', $url, ['http_errors' => false]);

        return $result;
    }
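
    /**
     * Delete a single crawl record by ID.
     */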
    public function destroy($id)
    {
        $webCrawl = WebCrawl::find($id);
        if ($webCrawl) {
            $webCrawl->delete();

            return response()->json([
                'message' => 'Web crawl deleted successfully',
            ]);
        }

        return response()->json([
            'error' => 'Web crawl not found',
        ], 404);
    }
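
    /**
     * Delete all stored crawl records.
     */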
    public function destroyAll()
    {
        WebCrawl::truncate();

        return response()->json([
            'message' => 'All web crawls deleted successfully',
        ]);
    }
}