Python script to get Google Search results
Location: https://github.com/sterin501/PythonScripts/blob/master/TorrentScrap/googleSearch.py
Requirements
On Windows, torrentSearch.exe can be used.
1. Install json
pip install json
(Note: json is part of the Python standard library, so this step is normally unnecessary.)
2. Install BeautifulSoup4
pip install BeautifulSoup4
3. Install requests
pip install requests
Option A: (Best option)
Scraping with the Google API is the best option. It provides the data in JSON format.
1. Get a Google API key.
2. Change the custom search settings to search the entire web.
Option B: Scrape from google.com
This is not a good option; Google changes its output format all the time.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
| #!/bin/python
import json,requests,bs4
# Single HTTP session shared by every search function in this module.
session_requests = requests.session()

# Load the settings once at import time. The context manager guarantees the
# file handle is closed, even when config.json contains malformed JSON.
with open('config.json') as configFile:
    configJson = json.load(configFile)

googleResultCount = configJson['googleResultCount']  # sent as the API 'num' parameter
googleKey = configJson['googleKey']  # sent as the API 'key' parameter
gooelcx = configJson['gooelcx']  # sent as the API 'cx' parameter
googleURLend = configJson['googleURLend']  # text that ends a scraped result URL

# The same proxy server is used for both http and https requests.
proxyDict = {
    "http": configJson['proxyserver'],
    "https": configJson['proxyserver'],
}
def getGoogleAPI(keyword):
    """Search for *keyword* through the Google Custom Search JSON API.

    Every result link is printed as it is collected.

    Args:
        keyword: The search query string.

    Returns:
        A list with the ``link`` value of every item in the response. The
        list is empty when the response carries no ``items`` entry.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    pnr_data = {
        'q': keyword,
        'googleHost': 'google.co.in',
        'num': googleResultCount,
        'key': googleKey,
        'cx': gooelcx,
    }
    url = "https://www.googleapis.com/customsearch/v1"
    result = session_requests.get(url, params=pnr_data, proxies=proxyDict)
    # Surface HTTP failures directly instead of a confusing KeyError on the
    # error payload further down.
    result.raise_for_status()
    results = result.json()

    URLS = []
    # A response without 'items' yields an empty list rather than a KeyError.
    for item in results.get('items', []):
        link = item['link']
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print(link)
        URLS.append(link)
    return URLS
def getFrommGoogleCOM(keyword):
    """Scrape result URLs for *keyword* from the google.co.in results page.

    Only anchors whose ``href`` starts with ``/url?q=`` are kept. The target
    URL is the text after that prefix, cut at the first occurrence of the
    configured ``googleURLend`` marker. The collected list is printed.

    Args:
        keyword: The search query string.

    Returns:
        A list of the extracted result URLs (possibly empty).
    """
    pnr_data = {
        'q': keyword,
        'gws_rd': "cr",
    }
    url = "https://www.google.co.in/search"
    result = session_requests.get(url, params=pnr_data, proxies=proxyDict)
    # NOTE: the "lxml" parser requires the lxml package to be installed.
    soup = bs4.BeautifulSoup(result.content, "lxml")

    redirect_prefix = "/url?q="
    URLS = []
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if not href.startswith(redirect_prefix):
            continue
        # Strip the redirect prefix rather than splitting on "http": a target
        # URL that itself contains "http" later on (e.g. inside its query
        # string) would otherwise be truncated to that trailing part.
        target = href[len(redirect_prefix):]
        URLS.append(target.split(googleURLend)[0])

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(URLS)
    return URLS
if __name__ == '__main__':
    # Demo run: query the Custom Search API, show the collected list, then
    # run the google.co.in scraper with a second sample query.
    URLS = getGoogleAPI("malayalam")
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(URLS)
    getFrommGoogleCOM("Ajith lv")
|