Re-order the code; add the service provider, the service itself, and the WebCrawlRequest form request

This commit is contained in:
Kfir Dayan 2023-06-01 08:33:46 +03:00
parent 9ac63c8db0
commit 231cd17902
4 changed files with 196 additions and 157 deletions

View file

@@ -2,158 +2,39 @@
namespace App\Http\Controllers;
use App\Models\WebCrawl;
use GuzzleHttp\Client;
use Illuminate\Http\Request;
use Illuminate\Support\Facades\Log;
use App\Http\Requests\WebCrawlRequest;
use App\Services\WebCrawlerService;
class WebCrawlController extends Controller
{
protected $webClient;
protected $webCrawlerService;
public function __construct(WebCrawlerService $webCrawlerService)
{
$this->webCrawlerService = $webCrawlerService;
}
public function index()
{
$allCrawls = WebCrawl::all();
return response()->json($allCrawls);
return $this->webCrawlerService->getAllCrawls();
}
public function crawlWebsite(Request $request)
public function crawlWebsite(WebCrawlRequest $request)
{
$url = $request->query('url');
$depth = $request->query('depth', 0);
$refresh = $request->query('refresh', false);
// Check if the URL is already in the database
$webCrawl = WebCrawl::where('url', $url)->first();
if ($webCrawl && !$refresh) {
Log::error("This URL already exists in the database $url");
return response()->json([
'error' => 'This URL already exists in the database',
], 400);
}
$this->webClient = new Client();
// Use GuzzleHttp client to send HTTP requests
$response = $this->scan($url);
if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
$crawler = $webCrawl ?? new WebCrawl();
$crawler->url = $url;
$crawler->content = $response->getBody()->getContents();
$linksFromPage = $this->getLinksFromPage($crawler->content);
try {
$crawler->save();
} catch (\Exception $e) {
Log::error($e->getMessage());
return response()->json([
'error' => 'Failed to save the URL to the database',
], 500);
}
if ($depth > 0 && count($linksFromPage) > 0) {
$results = [];
foreach ($linksFromPage as $link) {
$results[] = $this->crawlWebsiteRecursive($link, $depth - 1);
}
return response()->json([
'message' => 'Crawling completed successfully',
'data' => $url
]);
}
} else {
Log::error("Failed to retrieve the URL $url");
return response()->json([
'error' => 'Failed to retrieve the URL',
], 500);
}
Log::info("Crawling completed successfully For URL $url");
return response()->json([
'message' => 'Crawling completed successfully',
]);
}
protected function crawlWebsiteRecursive($url, $depth)
{
// Check if the URL is already in the database
$webCrawl = WebCrawl::where('url', $url)->first();
if ($webCrawl) {
return [];
}
// Use GuzzleHttp client to send HTTP requests
$response = $this->scan($url);
if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
$crawler = new WebCrawl();
$crawler->url = $url;
$crawler->content = $response->getBody()->getContents();
$linksFromPage = $this->getLinksFromPage($crawler->content);
try {
$crawler->save();
Log::info("URL saved to the database $url");
} catch (\Exception $e) {
Log::error("Can't save the URL to the database $url");
return [];
}
if ($depth > 0 && count($linksFromPage) > 0) {
$results = [];
foreach ($linksFromPage as $link) {
$results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
}
return $results;
}
}
return [];
}
protected function getLinksFromPage($crawlerContent)
{
$dom = new \DOMDocument();
@$dom->loadHTML($crawlerContent);
$links = $dom->getElementsByTagName('a');
$linksFromPage = [];
foreach ($links as $link) {
if ($link->getAttribute('href') && filter_var($link->getAttribute('href'), FILTER_VALIDATE_URL)) {
$linksFromPage[] = $link->getAttribute('href');
}
}
return $linksFromPage;
}
protected function scan($url)
{
try {
$result = $this->webClient->request('GET', $url);
} catch (\Exception $e) {
return response()->json([
'error' => 'Failed to retrieve the URL',
], 500);
}
return $result;
return $this->webCrawlerService->crawlWebsite($url, $depth, $refresh);
}
public function destroy($id)
{
$webCrawl = WebCrawl::find($id);
if ($webCrawl) {
$webCrawl->delete();
Log::info("Web crawl deleted successfully For ID $id");
return response()->json([
'message' => 'Web crawl deleted successfully',
]);
}
Log::error("Web crawl not found For ID $id");
return response()->json([
'error' => 'Web crawl not found',
], 404);
return $this->webCrawlerService->deleteCrawl($id);
}
public function destroyAll()
{
WebCrawl::truncate();
Log::info("All web crawls deleted successfully");
return response()->json([
'message' => 'All web crawls deleted successfully',
]);
return $this->webCrawlerService->deleteAllCrawls();
}
}

View file

@@ -0,0 +1,23 @@
<?php
namespace App\Http\Requests;
use Illuminate\Foundation\Http\FormRequest;
/**
 * Validates query parameters for the web-crawl endpoint.
 *
 * Expected input:
 *  - url:     required; must be a syntactically valid URL
 *  - depth:   optional crawl depth; non-negative integer
 *  - refresh: optional integer flag; truthy forces re-crawling a stored URL
 */
class WebCrawlRequest extends FormRequest
{
    /**
     * Determine whether the caller is authorized to run a crawl.
     *
     * @return bool
     */
    public function authorize(): bool
    {
        // No authorization restrictions yet; add gate/policy checks here if needed.
        return true;
    }

    /**
     * Validation rules applied to the request input.
     *
     * @return array<string, string>
     */
    public function rules(): array
    {
        return [
            'url' => 'required|url',
            'depth' => 'integer|min:0',
            // NOTE(review): 'boolean' may express the intent better than
            // 'integer' here, since callers treat refresh as a flag — confirm.
            'refresh' => 'integer',
        ];
    }
}

View file

@@ -2,34 +2,15 @@
namespace App\Providers;
use App\Services\WebCrawlerService;
use Illuminate\Support\ServiceProvider;
class CrawlerServiceProvider extends ServiceProvider
class WebCrawlServiceProvider extends ServiceProvider
{
/**
* Register services.
*
* @return void
*/
public function register()
{
//
$this->app->singleton(WebCrawlerService::class, function () {
return new WebCrawlerService();
});
}
public function crawlWebsite($url, $depth) {
$visitedUrls = [];
echo 'HERE!';die;
}
/**
* Bootstrap services.
*
* @return void
*/
public function boot()
{
//
}
}

View file

@@ -0,0 +1,154 @@
<?php
namespace App\Services;
use App\Models\WebCrawl;
use GuzzleHttp\Client;
use Illuminate\Support\Facades\Log;
/**
 * Crawls web pages, persists them as WebCrawl records, and recursively
 * follows links up to a caller-supplied depth.
 *
 * All public methods return Laravel JSON responses so the controller can
 * pass them straight through.
 */
class WebCrawlerService
{
    /**
     * HTTP client used to fetch pages.
     *
     * @var Client
     */
    protected $webClient;

    public function __construct()
    {
        $this->webClient = new Client();
    }

    /**
     * Return every stored crawl as a JSON response.
     */
    public function getAllCrawls()
    {
        return response()->json(WebCrawl::all());
    }

    /**
     * Crawl $url, persist the page content, and follow its links.
     *
     * @param string $url     Starting URL.
     * @param int    $depth   Levels of links to follow (0 = only $url itself).
     * @param mixed  $refresh Truthy to re-crawl a URL that is already stored.
     */
    public function crawlWebsite($url, $depth, $refresh)
    {
        // Refuse duplicates unless the caller explicitly asked for a refresh.
        $existing = WebCrawl::where('url', $url)->first();
        if ($existing && !$refresh) {
            Log::error("This URL already exists in the database $url");
            return response()->json([
                'error' => 'This URL already exists in the database',
            ], 400);
        }

        $response = $this->scan($url);
        // scan() returns null on transport failure; also reject non-2xx codes.
        if ($response === null
            || $response->getStatusCode() < 200
            || $response->getStatusCode() >= 300
        ) {
            Log::error("Failed to retrieve the URL $url");
            return response()->json([
                'error' => 'Failed to retrieve the URL',
            ], 500);
        }

        // Reuse the existing row on refresh so we update rather than duplicate.
        $crawl = $existing ?? new WebCrawl();
        $crawl->url = $url;
        $crawl->content = $response->getBody()->getContents();
        $links = $this->getLinksFromPage($crawl->content);

        try {
            $crawl->save();
        } catch (\Exception $e) {
            Log::error($e->getMessage());
            return response()->json([
                'error' => 'Failed to save the URL to the database',
            ], 500);
        }

        if ($depth > 0 && count($links) > 0) {
            foreach ($links as $link) {
                // Recursion persists pages as a side effect; the per-link
                // results are intentionally not surfaced to the caller.
                $this->crawlWebsiteRecursive($link, $depth - 1);
            }
            return response()->json([
                'message' => 'Crawling completed successfully',
                'data' => $url,
            ]);
        }

        Log::info("Crawling completed successfully For URL $url");
        return response()->json([
            'message' => 'Crawling completed successfully',
        ]);
    }

    /**
     * Delete a single crawl record by id.
     *
     * @param mixed $id Primary key of the WebCrawl row.
     */
    public function deleteCrawl($id)
    {
        $webCrawl = WebCrawl::find($id);
        if ($webCrawl) {
            $webCrawl->delete();
            Log::info("Web crawl deleted successfully For ID $id");
            return response()->json([
                'message' => 'Web crawl deleted successfully',
            ]);
        }

        Log::error("Web crawl not found For ID $id");
        return response()->json([
            'error' => 'Web crawl not found',
        ], 404);
    }

    /**
     * Delete every stored crawl record.
     */
    public function deleteAllCrawls()
    {
        WebCrawl::truncate();
        Log::info("All web crawls deleted successfully");
        return response()->json([
            'message' => 'All web crawls deleted successfully',
        ]);
    }

    /**
     * Depth-first recursive crawl helper.
     *
     * @param string $url   URL to fetch and persist.
     * @param int    $depth Remaining levels of links to follow.
     * @return array Flat list of results from deeper levels (empty on skip/failure).
     */
    protected function crawlWebsiteRecursive($url, $depth)
    {
        // Skip URLs that were already crawled; this also terminates cycles.
        if (WebCrawl::where('url', $url)->first()) {
            return [];
        }

        $response = $this->scan($url);
        if ($response === null
            || $response->getStatusCode() < 200
            || $response->getStatusCode() >= 300
        ) {
            return [];
        }

        $crawl = new WebCrawl();
        $crawl->url = $url;
        $crawl->content = $response->getBody()->getContents();
        $links = $this->getLinksFromPage($crawl->content);

        try {
            $crawl->save();
            Log::info("URL saved to the database $url");
        } catch (\Exception $e) {
            Log::error("Can't save the URL to the database $url");
            return [];
        }

        $results = [];
        if ($depth > 0) {
            foreach ($links as $link) {
                $results = array_merge($results, $this->crawlWebsiteRecursive($link, $depth - 1));
            }
        }
        return $results;
    }

    /**
     * Extract absolute links from an HTML document.
     *
     * @param string $crawlerContent Raw HTML.
     * @return array<int, string> href values that pass FILTER_VALIDATE_URL.
     */
    protected function getLinksFromPage($crawlerContent)
    {
        $dom = new \DOMDocument();
        // Suppress libxml warnings: real-world HTML is frequently malformed.
        @$dom->loadHTML($crawlerContent);

        $found = [];
        foreach ($dom->getElementsByTagName('a') as $anchor) {
            $href = $anchor->getAttribute('href');
            if ($href && filter_var($href, FILTER_VALIDATE_URL)) {
                $found[] = $href;
            }
        }
        return $found;
    }

    /**
     * Fetch $url via GET.
     *
     * Previously this returned a JsonResponse on failure, which callers then
     * probed with getStatusCode() — a type confusion that only worked by
     * accident. Returning null makes the failure explicit.
     *
     * @param string $url
     * @return \Psr\Http\Message\ResponseInterface|null The HTTP response, or
     *         null when the request could not be completed.
     */
    protected function scan($url)
    {
        try {
            return $this->webClient->request('GET', $url);
        } catch (\Exception $e) {
            Log::error("Request failed for URL $url: " . $e->getMessage());
            return null;
        }
    }
}