Refactor and finish the routes for deleteAll, deleteOne, getAll, and crawl

Kfir Dayan 2023-05-31 12:35:41 +03:00
parent cdffb33aff
commit 6a7eb59295
2 changed files with 120 additions and 151 deletions

app/Http/Controllers/WebCrawlController.php

@@ -3,152 +3,131 @@
 namespace App\Http\Controllers;
 use App\Models\WebCrawl;
-use Illuminate\Http\Request;
 use GuzzleHttp\Client;
+use Illuminate\Http\Request;
 class WebCrawlController extends Controller
 {
-    /**
-     * Display a listing of the resource.
-     *
-     * @return \Illuminate\Http\Response
-     */
+    protected $webClient;
+    protected $webCrawl;
     public function index()
     {
         $allCrawls = WebCrawl::all();
-        // Return the results in JSON format
         return response()->json($allCrawls);
     }
-    public function crawlWebsite($url, $depth) {
-        // // Use GuzzleHttp client to send HTTP requests
-        $client = new Client();
-        $response = $client->get($url);
-        if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
-            $body = $response->getBody()->getContents();
-            // get:
-            // links from the page
-            // full content
-            // depth
-            // url
-            // visitedUrls
-        }
-        // // Check if the HTTP response is successful (status code 2xx)
-        // // Insert a page into the database if the HTTP response status is successful
-        // $webCrawl = new WebCrawl();
-        // $webCrawl->url = $url;
-        // $webCrawl->content = $response->getBody()->getContents();
-        // $webCrawl->save();
-        // }
-        // Crawl the links on the page
-        echo 'Crawling completed!';
-    }
-    public function getOne($url)
-    {
-        $webCrawl = WebCrawl::where('url', $url)->first();
-        echo 'here!';die;
-        if ($webCrawl) {
-            return $webCrawl;
-        }
-        return false;
-    }
-    /**
-     * Show the form for creating a new resource.
-     *
-     * @return \Illuminate\Http\Response
-     */
-    private function create($response, $url, $depth, $visitedUrls, $links)
-    {
-        $webCrawl = new WebCrawl();
-        $webCrawl->url = $url;
-        $webCrawl->content = $response->getBody()->getContents();
-        $webCrawl->depth = $depth;
-        $webCrawl->visited_urls = $visitedUrls;
-        $webCrawl->status_code = $response->getStatusCode();
-        $webCrawl->status = $response->getReasonPhrase();
-        $webCrawl->created_at = $response->getHeader('Date')[0];
-        $webCrawl->updated_at = $response->getHeader('Last-Modified')[0];
-        $webCrawl->links = $links;
-        $webCrawl->save();
-        return $webCrawl;
-    }
-    /**
-     * Store a newly created resource in storage.
-     *
-     * @param \Illuminate\Http\Request $request
-     * @return \Illuminate\Http\Response
-     */
-    public function store(Request $request)
-    {
-        //
-    }
-    /**
-     * Display the specified resource.
-     *
-     * @param \App\Models\WebCrawl $webCrawl
-     * @return \Illuminate\Http\Response
-     */
-    public function show(WebCrawl $webCrawl)
-    {
-        //
-    }
-    /**
-     * Show the form for editing the specified resource.
-     *
-     * @param \App\Models\WebCrawl $webCrawl
-     * @return \Illuminate\Http\Response
-     */
-    public function edit(WebCrawl $webCrawl)
-    {
-        //
-    }
-    /**
-     * Update the specified resource in storage.
-     *
-     * @param \Illuminate\Http\Request $request
-     * @param \App\Models\WebCrawl $webCrawl
-     * @return \Illuminate\Http\Response
-     */
-    public function update(Request $request, WebCrawl $webCrawl)
-    {
-        //
-    }
-    /**
-     * Remove the specified resource from storage.
-     *
-     * @param \App\Models\WebCrawl $webCrawl
-     * @return \Illuminate\Http\Response
-     */
+    public function crawlWebsite(Request $request)
+    {
+        $url = $request->query('url');
+        $depth = $request->query('depth', 1);
+        // Check if the URL is already in the database
+        $webCrawl = WebCrawl::where('url', $url)->first();
+        if ($webCrawl) {
+            return response()->json([
+                'error' => 'This URL already exists in the database',
+            ], 400);
+        }
+        $this->webClient = new Client();
+        // Use GuzzleHttp client to send HTTP requests
+        $response = $this->scan($url);
+        if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
+            $crawler = new WebCrawl();
+            $crawler->url = $url;
+            $crawler->content = $response->getBody()->getContents();
+            $linksFromPage = $this->getLinksFromPage($crawler->content);
+            try {
+                $crawler->save();
+            } catch (\Exception $e) {
+                return response()->json([
+                    'error' => 'Failed to save the URL to the database',
+                ], 500);
+            }
+            if ($depth > 0 && count($linksFromPage) > 0) {
+                foreach ($linksFromPage as $link) {
+                    $this->crawlWebsiteRecursive($link, $depth - 1);
+                }
+            }
+        } else {
+            return response()->json([
+                'error' => 'Failed to retrieve the URL',
+            ], 500);
+        }
+        return response()->json([
+            'message' => 'Crawling completed successfully',
+        ]);
+    }
+    protected function crawlWebsiteRecursive($url, $depth)
+    {
+        // Check if the URL is already in the database
+        $webCrawl = WebCrawl::where('url', $url)->first();
+        if ($webCrawl) {
+            return;
+        }
+        // Use GuzzleHttp client to send HTTP requests
+        $response = $this->scan($url);
+        if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
+            $crawler = new WebCrawl();
+            $crawler->url = $url;
+            $crawler->content = $response->getBody()->getContents();
+            $linksFromPage = $this->getLinksFromPage($crawler->content);
+            try {
+                $crawler->save();
+            } catch (\Exception $e) {
+                return;
+            }
+            if ($depth > 0 && count($linksFromPage) > 0) {
+                foreach ($linksFromPage as $link) {
+                    $this->crawlWebsiteRecursive($link, $depth - 1);
+                }
+            }
+        }
+    }
+    protected function getLinksFromPage($crawlerContent)
+    {
+        $dom = new \DOMDocument();
+        @$dom->loadHTML($crawlerContent);
+        $links = $dom->getElementsByTagName('a');
+        $linksFromPage = [];
+        foreach ($links as $link) {
+            if ($link->getAttribute('href') && filter_var($link->getAttribute('href'), FILTER_VALIDATE_URL)) {
+                $linksFromPage[] = $link->getAttribute('href');
+            }
+        }
+        return $linksFromPage;
+    }
+    protected function scan($url)
+    {
+        $result = $this->webClient->request('GET', $url);
+        return $result;
+    }
     public function destroy($id)
     {
-        $webCrawl = WebCrawl::where("_id", $id);
-        echo '<pre>';
-        echo 'fff';
-        print_r($webCrawl);die;
+        $webCrawl = WebCrawl::find($id);
         if ($webCrawl) {
             $webCrawl->delete();
-            return true;
+            return response()->json([
+                'message' => 'Web crawl deleted successfully',
+            ]);
         }
-        return false;
+        return response()->json([
+            'error' => 'Web crawl not found',
+        ], 404);
     }
+    public function destroyAll()
+    {
+        WebCrawl::truncate();
+        return response()->json([
+            'message' => 'All web crawls deleted successfully',
+        ]);
+    }
 }
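
The heart of the refactor is the recursive crawl: getLinksFromPage parses each fetched page with DOMDocument and keeps only href values that pass FILTER_VALIDATE_URL. A minimal standalone sketch of that same technique follows; the sample HTML is made up for illustration and is not part of the commit.

<?php
// Sketch of the link-extraction approach used by getLinksFromPage.
// Assumes PHP's dom extension; the sample HTML is a hypothetical input.
$html = '<html><body>
    <a href="https://example.com/a">absolute link</a>
    <a href="/relative/path">relative link</a>
    <a>anchor without href</a>
</body></html>';

$dom = new DOMDocument();
// Suppress warnings on malformed real-world HTML, as the controller does with @
@$dom->loadHTML($html);

$links = [];
foreach ($dom->getElementsByTagName('a') as $anchor) {
    $href = $anchor->getAttribute('href');
    // FILTER_VALIDATE_URL requires a scheme, so relative links are dropped
    if ($href && filter_var($href, FILTER_VALIDATE_URL)) {
        $links[] = $href;
    }
}

print_r($links); // Array ( [0] => https://example.com/a )

One consequence of this filter: the crawler only follows absolute URLs, so relative links on a page are never queued for the recursive crawl.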

routes file

@@ -2,44 +2,34 @@
 use Illuminate\Http\Request;
 use Illuminate\Support\Facades\Route;
-use GuzzleHttp\Client;
 use App\Http\Controllers\WebCrawlController;
-use GuzzleHttp\Psr7\Response;
 Route::get('/crawl', function (Request $request) {
-    // invoke WebCrawlController index method in WebCrawlController
-    $url = $request->input('url');
-    // check if the url is a valid URL
+    $url = $request->query('url');
     if (!$url || !filter_var($url, FILTER_VALIDATE_URL)) {
         return response()->json([
-            'error' => 'Missing required parameter `url`'
+            'error' => 'Missing required parameter `url` or invalid URL',
         ], 400);
     }
-    $depth = $request->input('depth', 3); // default depth is 3 if not provided
+    $depth = $request->query('depth', 3);
     $crawlerController = new WebCrawlController();
-    $isAlreadyDone = $crawlerController->getOne($url);
-    if (!$isAlreadyDone) {
-        $crawlerController->crawlWebsite($url, $depth);
-    } else {
-        return response()->json([
-            'error' => 'This URL has already been crawled',
-            'data' => $isAlreadyDone
-        ], 400);
-    }
+    return $crawlerController->crawlWebsite($request);
 });
-// Route::post('/crawl/{id}', function (String $id, Request $request, Response $response) {
-//     $id = $request->input('id');
-//     $crawlerController = new WebCrawlController();
-//     if (!$crawlerController->destroy($id)) {
-//         return response()->json([
-//             'error' => 'Url Not Found',
-//         ], 404);
-//     } else {
-//         return response()->json([
-//             'success' => 'This URL has been deleted',
-//         ], 200);
-//     }
-// });
+Route::get('/', function () {
+    $crawlerController = new WebCrawlController();
+    return $crawlerController->index();
+});
+Route::delete('/crawl/{id}', function ($id) {
+    $crawlerController = new WebCrawlController();
+    return $crawlerController->destroy($id);
+});
+Route::delete('/crawl', function () {
+    $crawlerController = new WebCrawlController();
+    return $crawlerController->destroyAll();
+});
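
With the routes in place, the four finished endpoints can be smoke-tested with a short Guzzle script. This is a hedged sketch, not part of the commit: the base URL, the absence of a route prefix, and the example id 1 are all assumptions.

<?php
// Hypothetical smoke test for the crawl API; adjust base_uri to your setup.
require 'vendor/autoload.php';

use GuzzleHttp\Client;

$client = new Client([
    'base_uri' => 'http://localhost:8000', // assumed local dev server
    'http_errors' => false, // surface 4xx/5xx bodies instead of throwing
]);

// Crawl: GET /crawl?url=...&depth=...
$res = $client->get('/crawl', [
    'query' => ['url' => 'https://example.com', 'depth' => 2],
]);
echo $res->getBody() . PHP_EOL; // {"message":"Crawling completed successfully"} on success

// getAll: GET /
echo $client->get('/')->getBody() . PHP_EOL;

// deleteOne: DELETE /crawl/{id} (1 is a placeholder id)
echo $client->delete('/crawl/1')->getBody() . PHP_EOL;

// deleteAll: DELETE /crawl
echo $client->delete('/crawl')->getBody() . PHP_EOL;

Note that the route closure reads depth with a default of 3 but then hands the whole $request to crawlWebsite, which reads it again with a default of 1, so the query-string value is what actually takes effect.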