2023-05-30 09:56:38 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
use Illuminate\Http\Request;
|
|
|
|
use Illuminate\Support\Facades\Route;
|
2023-05-30 12:10:17 +00:00
|
|
|
use GuzzleHttp\Client;
|
|
|
|
use Illuminate\Support\Facades\DB;
|
|
|
|
use MongoDB\Client as MongoClient;
|
2023-05-30 09:56:38 +00:00
|
|
|
|
2023-05-30 12:10:17 +00:00
|
|
|
/**
 * GET /crawl
 *
 * Query parameters:
 *   - url   (required) address the crawl starts from.
 *   - depth (optional) crawl depth handed to crawlWebsite(); defaults to 3.
 *
 * Responds with HTTP 400 and an `error` message when `url` is missing,
 * otherwise with a JSON object listing the URLs that were visited.
 */
Route::get('/crawl', function (Request $request) {
    $url = $request->input('url');

    // Validate the request first so a malformed call gets its 400 response
    // regardless of what the backend connectivity check does.
    if (!$url) {
        return response()->json([
            'error' => 'Missing required parameter `url`'
        ], 400);
    }

    check_connection_to_mongodb();

    // Query-string values arrive as strings; normalise to a non-negative int
    // before handing the value to the crawler. Default depth is 3.
    $depth = max(0, (int) $request->input('depth', 3));

    $visitedUrls = [];
    crawlWebsite($url, $depth, $visitedUrls);

    // Previously the closure returned nothing, so the client received an
    // empty response; report what was crawled instead.
    return response()->json([
        'visited' => $visitedUrls,
    ]);
});
|
2023-05-30 12:10:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
/**
 * Verify that the MongoDB server is reachable.
 *
 * Constructing the client is not enough on its own: the library connects
 * lazily, so a `ping` command is issued to force a real round-trip.
 *
 * The earlier version printed a debug banner and called `die`, which ended
 * every request that invoked it and left the statements after the first
 * `die` unreachable. It now returns normally on success.
 *
 * @throws \MongoDB\Driver\Exception\Exception when the server cannot be
 *         reached or rejects the command.
 */
function check_connection_to_mongodb(): void
{
    // No URI given, as before, so the library's default connection string is used.
    $connection = new MongoClient();

    // `ping` is a no-op command; it only succeeds once a server has answered.
    $connection->selectDatabase('admin')->command(['ping' => 1]);
}
|
|
|
|
/**
 * Fetch one page and record its URL as visited.
 *
 * @param mixed $url         Address to fetch.
 * @param mixed $depth       Remaining crawl depth. Currently unused: link
 *                           following is still disabled (see the TODO below).
 * @param array $visitedUrls Passed by reference; every URL handled by this
 *                           function is appended so it is not fetched twice.
 *
 * Transport-level failures (DNS, refused connection, timeout) are not caught
 * here and surface as Guzzle exceptions to the caller.
 */
function crawlWebsite($url, $depth, &$visitedUrls): void
{
    // Strict comparison: loose matching treats numeric-looking strings such
    // as '1e3' and '1000' as the same entry.
    if (in_array($url, $visitedUrls, true)) {
        return;
    }

    $visitedUrls[] = $url;

    $client = new Client();

    // By default Guzzle throws on 4xx/5xx responses, which would make the
    // status check below pointless; disable that so non-2xx pages are simply
    // skipped.
    $response = $client->get($url, ['http_errors' => false]);

    $status = $response->getStatusCode();
    if ($status < 200 || $status >= 300) {
        return;
    }

    // TODO: persisting the page and following its links are not enabled yet.
    // The disabled draft is kept below for reference. When enabling it, read
    // the body once into a variable: a second getContents() call on the same
    // stream only returns what is left, i.e. an empty string.
    //
    // $body = (string) $response->getBody();
    //
    // DB::table('pages')->insert([
    //     'url' => $url,
    //     'content' => $body,
    // ]);
    //
    // if ($depth > 0) {
    //     $dom = new DOMDocument();
    //     @$dom->loadHTML($body);
    //
    //     foreach ($dom->getElementsByTagName('a') as $link) {
    //         $href = $link->getAttribute('href');
    //         if (filter_var($href, FILTER_VALIDATE_URL)) {
    //             crawlWebsite($href, $depth - 1, $visitedUrls);
    //         }
    //     }
    // }
}
|