# Semantic Keyword Clustering by Pemavor.com
# Author: Stefan Neefischer (stefan.neefischer@gmail.com)

from googleapiclient.discovery import build
import pandas as pd
from datetime import datetime
from fuzzywuzzy import fuzz
from urllib.parse import urlparse
from tld import get_tld
import langid
import networkx as nx
import community  # python-louvain; provides community.best_partition
import sqlite3

def cluster_return(searchTerm, partition):
    return partition[searchTerm]

def language_detection(str_lan):
    # langid.classify returns (language_code, score); keep only the code
    lan = langid.classify(str_lan)
    return lan[0]

def extract_domain(url, remove_http=True):
    uri = urlparse(url)
    if remove_http:
        domain_name = f"{uri.netloc}"
    else:
        # keep the scheme (e.g. "https") in front of the domain
        domain_name = f"{uri.scheme}://{uri.netloc}"
    return domain_name

def extract_mainDomain(url):
    # first-level domain, e.g. "example.com" for "https://www.example.com/page"
    res = get_tld(url, as_object=True)
    return res.fld

def fuzzy_ratio(str1, str2):
    return fuzz.ratio(str1, str2)

def fuzzy_token_set_ratio(str1, str2):
    return fuzz.token_set_ratio(str1, str2)

def google_search(search_term, api_key, cse_id, hl, gl, **kwargs):
    try:
        service = build("customsearch", "v1", developerKey=api_key, cache_discovery=False)
        res = service.cse().list(q=search_term, hl=hl, gl=gl,
                                 fields='queries(request(totalResults,searchTerms,hl,gl)),items(title,displayLink,link,snippet)',
                                 num=10, cx=cse_id, **kwargs).execute()
        return res
    except Exception as e:
        print(e)
        return None

def google_search_default_language(search_term, api_key, cse_id, gl, **kwargs):
    try:
        service = build("customsearch", "v1", developerKey=api_key, cache_discovery=False)
        res = service.cse().list(q=search_term, gl=gl,
                                 fields='queries(request(totalResults,searchTerms,hl,gl)),items(title,displayLink,link,snippet)',
                                 num=10, cx=cse_id, **kwargs).execute()
        return res
    except Exception as e:
        print(e)
        return None

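# Illustrative shape of the fields-trimmed response both helpers return
# (values invented for illustration; the keys follow the 'fields' filter above):
#
# {
#   "queries": {"request": [{"totalResults": "1250000", "searchTerms": "crm software",
#                            "hl": "en", "gl": "us"}]},
#   "items": [{"title": "...", "displayLink": "example.com",
#              "link": "https://example.com/...", "snippet": "..."}]
# }
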
def getCluster(DATABASE, SERP_TABLE, CLUSTER_TABLE, TIMESTAMP="max"):
    dateTimeObj = datetime.now()
    connection = sqlite3.connect(DATABASE)
    if TIMESTAMP == "max":
        df = pd.read_sql(f'select * from {SERP_TABLE} where requestTimestamp=(select max(requestTimestamp) from {SERP_TABLE})', connection)
    else:
        df = pd.read_sql(f'select * from {SERP_TABLE} where requestTimestamp="{TIMESTAMP}"', connection)
    G = nx.Graph()
    # add graph nodes from the dataframe column
    G.add_nodes_from(df['searchTerms'])
    # add edges between keywords that rank for the same result URL
    for index, row in df.iterrows():
        df_link = df[df["link"] == row["link"]]
        for index1, row1 in df_link.iterrows():
            G.add_edge(row["searchTerms"], row1['searchTerms'])

    # compute the best partition for communities (clusters)
    partition = community.best_partition(G)

    cluster_df = pd.DataFrame(columns=["cluster", "searchTerms"])
    cluster_df["searchTerms"] = list(df["searchTerms"].unique())
    cluster_df["cluster"] = cluster_df.apply(lambda row: cluster_return(row["searchTerms"], partition), axis=1)
    aggregations = {"searchTerms": ' | '.join}
    clusters_grouped = cluster_df.groupby("cluster").agg(aggregations).reset_index()
    clusters_grouped["requestTimestamp"] = dateTimeObj
    clusters_grouped = clusters_grouped[["requestTimestamp", "cluster", "searchTerms"]]
    # save the clusters to the sqlite cluster table
    clusters_grouped.to_sql(name=CLUSTER_TABLE, index=False, if_exists="append", dtype={"requestTimestamp": "DateTime"}, con=connection)

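# Tiny worked illustration of the edge-building above (hypothetical data): if
# "crm software" and "best crm tools" both rank for https://example.com/crm, the
# graph gets the edge ("crm software", "best crm tools"), and community.best_partition
# (Louvain) will then tend to put both keywords into the same cluster id.
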
def getSearchResult(filename, hl, gl, my_api_key, my_cse_id, DATABASE, TABLE):
    dateTimeObj = datetime.now()
    rows_to_insert = []
    keyword_df = pd.read_csv(filename)
    # keywords are read from the first column of the csv file
    keywords = keyword_df.iloc[:, 0].tolist()
    for query in keywords:
        if hl == "default":
            result = google_search_default_language(query, my_api_key, my_cse_id, gl)
        else:
            result = google_search(query, my_api_key, my_cse_id, hl, gl)

        if result and "items" in result and "queries" in result:
            for position in range(0, len(result["items"])):
                item = result["items"][position]
                item["position"] = position + 1
                item["main_domain"] = extract_mainDomain(item["link"])
                item["title_matchScore_token"] = fuzzy_token_set_ratio(item["title"], query)
                item["snippet_matchScore_token"] = fuzzy_token_set_ratio(item["snippet"], query)
                item["title_matchScore_order"] = fuzzy_ratio(item["title"], query)
                item["snippet_matchScore_order"] = fuzzy_ratio(item["snippet"], query)
                item["snippet_language"] = language_detection(item["snippet"])

            for item in result["items"]:
                rows_to_insert.append({"requestTimestamp": dateTimeObj, "searchTerms": query, "gl": gl, "hl": hl,
                                       "totalResults": result["queries"]["request"][0]["totalResults"],
                                       "link": item["link"], "displayLink": item["displayLink"],
                                       "main_domain": item["main_domain"], "position": item["position"],
                                       "snippet": item["snippet"], "snippet_language": item["snippet_language"],
                                       "snippet_matchScore_order": item["snippet_matchScore_order"],
                                       "snippet_matchScore_token": item["snippet_matchScore_token"],
                                       "title": item["title"],
                                       "title_matchScore_order": item["title_matchScore_order"],
                                       "title_matchScore_token": item["title_matchScore_token"],
                                       })

    df = pd.DataFrame(rows_to_insert)
    # save the serp results to the sqlite database
    connection = sqlite3.connect(DATABASE)
    df.to_sql(name=TABLE, index=False, if_exists="append", dtype={"requestTimestamp": "DateTime"}, con=connection)

##############################################################################################################################################
# Read Me:
##############################################################################################################################################
# 1- You need to set up a Google Custom Search Engine.
#    Please provide the API key and the Search Engine ID,
#    and set the country and language for which you want to monitor SERP results.
#    If you don't have an API key and a Search Engine ID yet,
#    you can follow the steps under the Prerequisites section on this page: https://developers.google.com/custom-search/v1/overview#prerequisites
#
# 2- You also need to enter the database, SERP table, and cluster table names that will be used for saving the results.
#
# 3- Enter the csv file name (or full path) that contains the keywords that will be used for the SERP queries.
#    A minimal sample file is sketched right after this block.
#
# 4- For keyword clustering, enter the timestamp of the SERP results that will be used for clustering.
#    If you want to cluster the most recent SERP results, enter "max" for the timestamp,
#    or you can enter a specific timestamp like "2021-02-18 17:18:05.195321".
#
# 5- Browse the results with the DB Browser for SQLite program.
##############################################################################################################################################

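# A minimal illustration of what keywords.csv could look like (assumption: the
# script reads only the first column, and pd.read_csv treats the first row as a
# header, so the header name itself is arbitrary):
#
#   keyword
#   crm software
#   best crm tools
#   crm for small business
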
# csv file name that contains the keywords for the SERP queries
CSV_FILE = "keywords.csv"
# determine the language (hl); pass "default" to omit the hl parameter
LANGUAGE = "en"
# determine the country (gl)
COUNTRY = "en"
# google custom search json api key
API_KEY = "ENTER KEY HERE"
# search engine ID
CSE_ID = "ENTER ID HERE"
# sqlite database name
DATABASE = "keywords.db"
# table name that the serp results will be saved to
SERP_TABLE = "keywords_serps"
# run the serp queries for all keywords
getSearchResult(CSV_FILE, LANGUAGE, COUNTRY, API_KEY, CSE_ID, DATABASE, SERP_TABLE)

# table name that the cluster results will be saved to
CLUSTER_TABLE = "keyword_clusters"
# please enter a timestamp if you want to build clusters for a specific crawl,
# or pass "max" to cluster the most recent serp results
# TIMESTAMP = "2021-02-18 17:18:05.195321"
TIMESTAMP = "max"
# build keyword clusters from the network graph with the community detection algorithm
getCluster(DATABASE, SERP_TABLE, CLUSTER_TABLE, TIMESTAMP)
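
# Optional: a quick sketch for inspecting the saved clusters without the DB Browser
# for SQLite program (assumes the table and database names above are unchanged).
inspect_connection = sqlite3.connect(DATABASE)
clusters = pd.read_sql(f'select * from {CLUSTER_TABLE} order by cluster', inspect_connection)
print(clusters.to_string(index=False))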