Web_Crawler_API/routes/api.php

<?php
use Illuminate\Http\Request;
use Illuminate\Support\Facades\Route;
use GuzzleHttp\Client;
use Illuminate\Support\Facades\DB;
use MongoDB\Client as MongoClient;
Route::get('/crawl', function (Request $request) {
    $url = $request->input('url');

    if (!$url) {
        return response()->json([
            'error' => 'Missing required parameter `url`'
        ], 400);
    }

    // Make sure MongoDB is reachable before starting the crawl
    check_connection_to_mongodb();

    $depth = $request->input('depth', 3); // default depth is 3 if not provided
    $visitedUrls = [];
    crawlWebsite($url, $depth, $visitedUrls);

    // Return the list of URLs that were crawled
    return response()->json(['visited' => $visitedUrls]);
});
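// Example usage (a sketch; the host, port and the default `/api` route prefix
// depend on how the application is served, e.g. via `php artisan serve`):
//
//   curl "http://localhost:8000/api/crawl?url=https://example.com&depth=2"
//
// The response lists every URL reached within the given depth.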
function check_connection_to_mongodb() {
    // Connect to the MongoDB server configured by the default connection URI;
    // listDatabases() forces a round trip, so an unreachable server throws here.
    $connection = new MongoClient();
    $connection->listDatabases();

    return $connection;
}
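// Sketch of how crawled pages could be stored through the MongoDB client that
// is already imported above, as an alternative to the SQL `pages` table used
// in crawlWebsite(). The helper name and the `crawler` database / `pages`
// collection names are assumptions, not part of the original code.
function store_page_in_mongodb($url, $content)
{
    $collection = (new MongoClient())->selectCollection('crawler', 'pages');
    $collection->insertOne([
        'url' => $url,
        'content' => $content,
        'crawled_at' => new MongoDB\BSON\UTCDateTime() // defaults to "now"
    ]);
}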
function crawlWebsite($url, $depth, &$visitedUrls)
{
    // Check if URL has already been visited
    if (in_array($url, $visitedUrls)) {
        return;
    }
    $visitedUrls[] = $url;

    // Use the GuzzleHttp client to send HTTP requests; http_errors is disabled
    // so a 4xx/5xx response does not throw and abort the whole crawl
    $client = new Client();
    try {
        $response = $client->get($url, ['http_errors' => false]);
    } catch (\GuzzleHttp\Exception\GuzzleException $e) {
        // Skip URLs that cannot be reached at all
        return;
    }

    // Check if the HTTP response is successful (status code 2xx)
    if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) {
        // Read the body once; getContents() drains the stream, so reuse this variable
        $body = $response->getBody()->getContents();

        // Insert page info into the database
        DB::table('pages')->insert([
            'url' => $url,
            'content' => $body
        ]);

        // Crawl the links on the page until the depth limit is reached
        if ($depth > 0) {
            $dom = new DOMDocument();
            @$dom->loadHTML($body);
            $links = $dom->getElementsByTagName('a');
            foreach ($links as $link) {
                $href = $link->getAttribute('href');
                if (filter_var($href, FILTER_VALIDATE_URL)) {
                    crawlWebsite($href, $depth - 1, $visitedUrls);
                }
            }
        }
    }
}