Python script to get Google Search results
Location: https://github.com/sterin501/PythonScripts/blob/master/TorrentScrap/googleSearch.py
Requirements
On Windows, torrentSearch.exe can be used.
1. Install json (it ships with Python's standard library, so this step is normally unnecessary):
pip install json
2. Install BeautifulSoup4:
pip install BeautifulSoup4
3. Install requests:
pip install requests
Option A (best option):
Scraping through the Google API is the best option. It provides the data in JSON format.
1. Get a Google API key.
2. Change the custom search settings to search the entire web.
Option B: Scrape from google.com
This is not a good option, because Google keeps changing its output format.
#!/bin/python
"""Collect result URLs for a Google search.

Two strategies are provided:

* getGoogleAPI      -- query the Custom Search JSON API.
* getFrommGoogleCOM -- scrape the google.co.in HTML results page.

Settings (result count, API key, search-engine id, URL terminator and proxy
server) are read from ``config.json`` in the current working directory when
this module is imported.
"""
import json

import bs4
import requests

session_requests = requests.session()

# Use a context manager so the config file handle is closed deterministically.
with open('config.json') as config_file:
    configJson = json.load(config_file)

googleResultCount = configJson['googleResultCount']
googleKey = configJson['googleKey']
gooelcx = configJson['gooelcx']  # spelling must match the key in config.json
googleURLend = configJson['googleURLend']
proxyDict = {
    "http": configJson['proxyserver'],
    "https": configJson['proxyserver'],
}

# Prefix that marks an outbound result link in the scraped HTML page.
_REDIRECT_PREFIX = "/url?q="


def getGoogleAPI(keyword):
    """Return the result links for *keyword* from the Custom Search JSON API.

    Each link is also printed as it is collected.

    Args:
        keyword: The search phrase sent as the ``q`` parameter.

    Returns:
        A list of the ``link`` values of the returned items; empty when the
        response carries no ``items`` entry.
    """
    pnr_data = {
        'q': keyword,
        'googleHost': 'google.co.in',
        'num': googleResultCount,
        'key': googleKey,
        'cx': gooelcx,
    }
    url = "https://www.googleapis.com/customsearch/v1"
    result = session_requests.get(url, params=pnr_data, proxies=proxyDict)
    results = json.loads(result.content)

    URLS = []
    # "items" may be missing (e.g. a search with no hits), so fall back to an
    # empty list rather than raising KeyError.
    for item in results.get('items', []):
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print(item['link'])
        URLS.append(item['link'])
    return URLS


def getFrommGoogleCOM(keyword):
    """Scrape the google.co.in results page for *keyword*.

    Anchors whose ``href`` starts with ``/url?q=`` are treated as results.
    The target URL is the text after that prefix, cut at the first occurrence
    of the configured ``googleURLend`` marker.

    Args:
        keyword: The search phrase sent as the ``q`` parameter.

    Returns:
        The list of extracted target URLs (also printed before returning).
    """
    pnr_data = {
        'q': keyword,
        'gws_rd': "cr",
    }
    url = "https://www.google.co.in/search"
    result = session_requests.get(url, params=pnr_data, proxies=proxyDict)
    soup = bs4.BeautifulSoup(result.content, "lxml")

    URLS = []
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if not href.startswith(_REDIRECT_PREFIX):
            continue
        # Strip the known prefix instead of splitting on "http": splitting
        # kept only the text after the LAST "http", which truncated targets
        # that contain "http" more than once.
        target = href[len(_REDIRECT_PREFIX):].split(googleURLend)[0]
        URLS.append(target)
    print(URLS)
    # Return the list as well so callers can use it, matching getGoogleAPI.
    return URLS


if __name__ == '__main__':
    URLS = getGoogleAPI("malayalam")
    print(URLS)
    getFrommGoogleCOM("Ajith lv")
No comments:
Post a Comment