Kako uzeti podatak sa nečijeg sajta, npr. iz taga <p>, pomoću PHP-a?
Pozdrav
Kako uzeti podatak sa nečijeg sajta, npr. iz taga <p>, pomoću PHP-a?
Pozdrav
Dobro za “skidanje” dnevnog horoskopa ili vremenske prognoze
Meni je ovdje najvažnije da uradiš cache preuzetih podataka, jer ako radiš naživo, “udušićeš” svoj sajt.
Curl + regex to rješava
Ima li negdje primjer ili barem ideja?
Bull shit
Imam desetak scrapera koje sam napravio i rade BESPRIJEKORNO
Kad mi netko veli da se nešto ne može, dobijem kečke na (rijetkoj) kosi…
Može se sve…
Tu isključiti react, vue i sl.
Valjda bi podatak da ni Jon Skeet ne može parsirati html regex-om odvratio i najupornije?
Evo jedna stara skripta koju sam si napravio za čupanje mailova sa XXXX (prije GDPR-a) i koja je koristila jednostavan regex za vađenje mailova.
Ne znam jel smijem, valjda me neće admini bannat?
<?php
// Script setup: initial state and the database include used by the
// scraping loop at the bottom of the file.

// NOTE(review): $emails is reassigned from get_emails() inside the main loop
// before it is ever read, so this initial literal looks unused - confirm.
$emails="Emails scraped:<br>";
// Passing 0 removes PHP's execution time limit so a long crawl is not aborted.
set_time_limit(0);
// URL of the first listing page to fetch (placeholder value).
$start_link="http://necu.rec/koji/je/bio/url";
// Presumably opens the MySQL connection and defines $conn, which the
// mysqli_query() calls later in the script rely on - verify against the include.
include ("data/db_conn_glavna.inc");
/**
 * Pick one of six hard-coded desktop browser User-Agent strings at random.
 *
 * @return string The User-Agent header value to send with a request.
 */
function get_user_agent()
{
    $agents = array(
        0 => 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0',
        1 => 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0',
        2 => 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
        3 => 'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0',
        4 => 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
        5 => 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
    );

    // rand() bounds are inclusive, so 0..5 can select every entry above.
    $choice = rand(0, 5);

    return $agents[$choice];
}
/**
 * Download a URL with cURL, sending a randomly chosen User-Agent.
 *
 * @param string $link URL to request.
 * @return string|bool The value returned by curl_exec(): the response body,
 *                     or false when the transfer fails.
 */
function get_page_content($link)
{
    $options = array(
        // Return the body from curl_exec() instead of printing it.
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_USERAGENT => get_user_agent(),
        CURLOPT_URL => $link
    );

    $handle = curl_init();
    curl_setopt_array($handle, $options);
    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}
/**
 * Cut the listing section out of a downloaded page.
 *
 * The section is the shortest span that starts at the literal
 * "holder_maincontent" (quotes included) and ends at the next literal
 * "textContent"; the span may cross line breaks.
 *
 * @param string|bool|null $raw Page HTML; a failed download (false/null) is
 *                              treated as an empty page.
 * @return string The matched span including both markers, or an empty string
 *                when the markers are not found.
 */
function get_list($raw)
{
    $matched = preg_match(
        "#\"holder_maincontent\"(.*?)\"textContent\"#s",
        (string) $raw,
        $resultpage
    );

    // preg_match() returns 0 on no match and false on a PCRE error; in both
    // cases $resultpage[0] does not exist, so it must not be read.
    if ($matched !== 1) {
        return "";
    }

    return $resultpage[0];
}
/**
 * Collect the href values of pagination anchors in a listing fragment.
 *
 * An anchor qualifies when, on a single line, class="pageLink ... is followed
 * by pageLinkPrevious" and then by an href="..." attribute.
 *
 * @param string $plist HTML fragment to search.
 * @return array List of href values in document order; empty when none match.
 */
function get_next_page($plist)
{
    $pattern = "#class=\"pageLink(.*?)pageLinkPrevious\"(.*?)href=\"(.*?)\"#";
    $groups = array();

    preg_match_all($pattern, $plist, $groups);

    // Capture group 3 holds the href attribute values.
    $hrefs = $groups[3];

    return $hrefs;
}
/**
 * Collect the href values of all ad title links in a listing fragment.
 *
 * A link qualifies when class="linkTitle" is followed, on the same line,
 * by an href="..." attribute.
 *
 * @param string $plist HTML fragment to search.
 * @return array List of href values in document order; empty when none match.
 */
function get_page_link($plist)
{
    $pattern = "#class=\"linkTitle\"(.*?)href=\"(.*?)\"#";
    $groups = array();

    preg_match_all($pattern, $plist, $groups);

    // Capture group 2 holds the href attribute values.
    $hrefs = $groups[2];

    return $hrefs;
}
/**
 * Download one ad page and extract the e-mail address it advertises.
 *
 * Only the part of the page between the literal "main_left" and the last
 * "main_right" is searched for a mailto: link.
 *
 * @param string $pagelink URL of the ad page.
 * @return string The address followed by the "::" record separator; just "::"
 *                when the download fails, the section markers are missing,
 *                or no usable mailto: link is found.
 */
function get_mail_from_ad($pagelink)
{
    // Reuse the shared downloader instead of duplicating its cURL setup.
    $resp = get_page_content($pagelink);

    // curl_exec() yields false on a failed transfer; there is nothing to parse.
    if (!is_string($resp)) {
        return "::";
    }

    // Without a match $section[0] is not set, so bail out before reading it.
    if (preg_match("#\"main_left\"[\s\S]*\"main_right\"#", $resp, $section) !== 1) {
        return "::";
    }

    return get_ad_mail($section[0]) . "::";
}
/**
 * Extract the first mailto: address that follows class="item_value".
 *
 * @param string|null $content HTML fragment of an ad page.
 * @return string The address, or an empty string when the fragment is empty,
 *                no mailto: link is found, or the captured value has no "@"
 *                after at least two leading characters.
 */
function get_ad_mail($content)
{
    if (!is_string($content) || $content === "") {
        return "";
    }

    // Without a match $email[2] is not set, so it must not be read.
    if (preg_match("#class=\"item_value\"(.*?)href=\"mailto:(.*?)\"#", $content, $email) !== 1) {
        return "";
    }

    $address = $email[2];
    $at = strpos($address, "@");

    // NOTE(review): the original "> 1" threshold is preserved; it also rejects
    // addresses with a one-character local part such as a@b.c - confirm intent.
    return ($at !== false && $at > 1) ? $address : "";
}
/**
 * Visit every ad URL and build one flat "url|mail::" record string.
 *
 * @param array $pages_links Zero-indexed list of ad page URLs.
 * @return string Concatenation of "<url>|" and the value returned by
 *                get_mail_from_ad() for each URL; empty for an empty list.
 */
function get_emails($pages_links)
{
    $records = array();
    $total = count($pages_links);

    for ($index = 0; $index < $total; $index++) {
        $url = $pages_links[$index];
        $records[] = $url . "|" . get_mail_from_ad($url);
    }

    return implode("", $records);
}
// Main crawl: walk the listing pages starting at $start_link, collect the
// e-mail address of every ad on each page, print and store the results, then
// follow the pagination link until none is left.

// Session character set only needs to be configured once per connection,
// not before every single insert.
mysqli_query($conn,"SET character_set_results = 'cp1250', character_set_client = 'cp1250', character_set_connection = 'cp1250', character_set_database = 'cp1250', character_set_server = 'cp1250'");

// Scraped values are untrusted input: bind them as parameters instead of
// concatenating them into the SQL text.
$insert_stmt = mysqli_prepare($conn, "insert into scrapedmails set url=?, email=?");
if ($insert_stmt === false) {
    trigger_error("Could not prepare insert statement: " . mysqli_error($conn), E_USER_WARNING);
}

$loop=0;
while ($loop==0)
{
    // fetch the listing page that contains the links to the ads
    $allpage = get_page_content($start_link);
    // keep only the section that holds the ads
    $results = get_list($allpage);
    // find the links to the individual ads
    $pages_links = get_page_link($results);
    // open every ad page and collect "url|mail::" records
    $emails = get_emails($pages_links);

    foreach (explode("::", $emails) as $record) {
        $ee = explode("|", $record);
        // The chunk after the final "::" is empty and has no second field;
        // records without an address are skipped as well.
        if (!isset($ee[1]) || trim($ee[1]) == "") {
            continue;
        }
        $url = $ee[0];
        $mail = $ee[1];
        echo "Url=".$url." - Email=".$mail."<br>";
        if ($insert_stmt !== false) {
            mysqli_stmt_bind_param($insert_stmt, "ss", $url, $mail);
            mysqli_stmt_execute($insert_stmt);
        }
    }

    // collect the links to the remaining listing pages
    $pagination = get_next_page($results);
    // On the last page the list is empty, so index 0 must not be read blindly.
    $next_link = isset($pagination[0]) ? $pagination[0] : "";
    if (trim($next_link) == "") {
        $loop = 1;
    }
    $start_link = $next_link;
    echo "Sljedeci link za cupanje: ".$start_link."<br>";
}

if ($insert_stmt !== false) {
    mysqli_stmt_close($insert_stmt);
}
echo "Gotovo";
include ("data/db_diss_glavna.inc");
?>
Imam ih i gdje sam vadio cijele članke, fotke i ostalo potrebno, također regexom.
e bas mi to treba
hvala