Command to list services: service --status-all
Commands to restart services:
~> service hbase-master restart # On Master host
~> service hbase-regionserver restart # On all RS hosts



Remove root password (if required):
sudo /etc/init.d/mysqld stop
sudo mysqld_safe --skip-grant-tables &
mysql -uroot
sudo /etc/init.d/mysqld start


Queries:
1.
a.
[cloudera@quickstart bin]$ mysql -u root
mysql> show databases;
mysql> create database shopping;
mysql> use shopping;

Create Tables
mysql> create table customer(
userId int Primary Key,
firstName char(50),
lastName char(50),
email varchar(100),
mobileNo numeric(10,0),
addr varchar(255)
);

mysql> create table orders(
orderId int primary key,
userId int,
dateOfOrder date,
statusOfOrder char(50),
Foreign Key (userId) references customer(userId)
);

mysql> create table product(
productId int Primary Key,
productName varchar(100),
modelNo varchar(50),
serialNo varchar(50),
price int,
manufacturer varchar(50),
category varchar(50)
);

mysql> create table productOrder(
SrNo int auto_increment primary key not null,
orderId int,
productId int,
quantity int,
Foreign Key (productId) references product(productId),
Foreign Key (orderId) references orders(orderId)
);
Insert Values:
insert into customer values (001, 'Nihar', 'Abhyankar', 'nihar@gmail.com', 9876543210, '445 Mount Eden Road, Mount Eden, Auckland');
insert into customer values (002, 'Anshul', 'Chaudhary', 'anshul@gmail.com', 9876543212, '21 Greens Road RD 2 Ruawai 0592');
insert into customer values (003, 'Prithvi', 'Amin', 'amin@gmail.com', 9876543213, 'Main Highway Otaki; 32 Wilson Street');
insert into customer values (004, 'Parth', 'Tripathy', 'parth@gmail.com', 9876543214, 'PO Box 39100, Howick');
insert into customer values (005, 'Neha', 'jain', 'neha@gmail.com', 9876543215, '305 - 14th Ave. S. Suite 3B');


insert into product values (111, 'HyperX', 'Fury', '12344343',4000,'HyperX','RAM');
insert into product values (112, 'Gigabyte GS-A20', 'hero', '12355347',6000,'Gigabyte','Motherboard');
insert into product values (113, 'Intel i5', '8400', '12366346',24000,'Intel','Processor');
insert into product values (114, 'Intel i7', '8700K', '12774345',34000,'Intel','Processor');
insert into product values (115, 'Zotac Graphic Card', 'RTX2080', '13254343',55000,'Zotac','GPU');
insert into product values (116, 'Zotac Graphic Card', 'RTX2060', '13004343',35000,'Zotac','GPU');
insert into product values (117, 'Corsair', 'Vengance', '78344343',4000,'Corsair','RAM');
insert into product values (118, 'Asus Tuf', 'Z370', '12378347',12000,'Asus','Motherboard');


insert into orders values (1111, 001, '2019-04-15', 'Delivered');
insert into orders values (1112, 002, '2018-05-25', 'Delivered');
insert into orders values (1113, 004, '2019-06-30', 'Delivered');
insert into orders values (1114, 005, '2019-08-01', 'Shipped');
insert into orders values (1115, 002, '2019-08-08', 'Placed');


insert into productOrder(orderId,productId,quantity) values(1111,111,1);
insert into productOrder(orderId,productId,quantity) values(1111,115,1);
insert into productOrder(orderId,productId,quantity) values(1111,117,1);
insert into productOrder(orderId,productId,quantity) values(1112,116,1);
insert into productOrder(orderId,productId,quantity) values(1113,118,1);
insert into productOrder(orderId,productId,quantity) values(1114,114,2);
insert into productOrder(orderId,productId,quantity) values(1114,114,1);
insert into productOrder(orderId,productId,quantity) values(1115,112,1);


Import single table to HDFS:
[cloudera@quickstart ~]$ sqoop import --table customer --connect jdbc:mysql://localhost/shopping --username root --target-dir /user/pract

sqoop import --table productOrder --connect jdbc:mysql://localhost/shopping --username root --target-dir /user/pract -m 1

Import multiple tables to HDFS:
[training@localhost ~]$ sqoop import-all-tables --connect jdbc:mysql://localhost/shopping --username root --warehouse-dir /user/pract

View data:
[training@localhost ~]$ hdfs dfs -cat /user/cloudera/customer/part-m-*

Import multiple tables to HIVE:
sqoop import-all-tables --connect jdbc:mysql://localhost/shopping --username root --hive-import

Import with a WHERE clause into Hive:
sqoop import \
  --connect "jdbc:mysql://localhost/training" \
  --username training -P \
  --table cityByCountry \
  --target-dir /user/where_clause \
  --where "state = 'Alaska'" \
  --hive-import -m 1

List Tables:
[training@localhost ~]$ sqoop list-tables --connect jdbc:mysql://localhost/reyDB --username root
List databases:
[training@localhost ~]$ sqoop list-databases --connect "jdbc:mysql://localhost" --username root --password cloudera
Export Data:
[training@localhost ~]$ sqoop export --connect jdbc:mysql://localhost/shopping --username root --export-dir=/user/hive/warehouse/temp --table temp


Display product name and model ordered by the user with userId = 002:
select c.firstName, c.lastName, p.productName, p.modelNo
from customer c, product p, orders o, productOrder po
where c.userId=2 and c.userId=o.userId and o.orderId=po.orderId and po.productId=p.productId;

Display customer name, ID, and the number of orders placed:
create table temp as
select c.userId, c.firstName, c.lastName, count(o.orderId)
from customer c, orders o
where c.userId=o.userId
group by c.userId, c.firstName, c.lastName;


hive> describe formatted temp;
This command shows the table's storage location
Path: /user/hive/warehouse/temp

Create MySQL table
mysql> create table temp(id int primary key, first_name varchar(20), last_name varchar(20), number bigint);

Export to MySQL:
sqoop export --connect jdbc:mysql://localhost/shopping -m 1 --table temp --export-dir /user/hive/warehouse/temp --input-fields-terminated-by '|' --input-lines-terminated-by '#' --fields-terminated-by '|' --lines-terminated-by '#' --username root


INSERT OVERWRITE LOCAL DIRECTORY '/home/cloudera/temp' ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
select customer.userId, customer.firstname, count(orders.orderid) from customer, orders where customer.userid=orders.userid group by customer.userid, customer.firstname;

Exit from Hive, go to MySQL, and create table output1 based on the output of the above query:
create table output1(customer_id int primary key, customer_name varchar(50), count_orders int);

load data infile '/home/cloudera/temp/000000_0' into table output1 columns terminated by ',';
If the above query doesn't work, try:
load data local infile '/home/cloudera/temp/000000_0' into table output1 columns terminated by ',';



HBASE

hbase(main):006:0> list

create - Creates a table.
list - Lists all the tables in HBase.
disable - Disables a table.
is_disabled - Verifies whether a table is disabled.
enable - Enables a table.
is_enabled - Verifies whether a table is enabled.
describe - Provides the description of a table.
alter - Alters a table.
alter 't1', NAME => 'f1', VERSIONS => 5
alter 'table name', 'delete' => 'column family'
exists - Verifies whether a table exists.
drop - Drops a table from HBase.
drop_all - Drops the tables matching the 'regex' given in the command.
To stop HBase: ./bin/stop-hbase.sh

put - Puts a cell value at a specified column in a specified row in a particular table.
get - Fetches the contents of a row or a cell.
delete - Deletes a cell value in a table.
deleteall - Deletes all the cells in a given row.
scan - Scans and returns the table data.
count - Counts and returns the number of rows in a table.
truncate - Disables, drops, and recreates a specified table.
hbase(main):032:0> create 'company', 'dept', 'project', 'dependent'
hbase(main):034:0> put 'company', 'C1', 'dept:no', '1'
0 row(s) in 0.2770 seconds
hbase(main):035:0> put 'company', 'C1','dept:name', 'D1'
hbase(main):043:0> scan 'company'
hbase(main):043:0> scan 'company', {COLUMNS=>'dept:name'}

hbase(main):031:0> create 'department',{'NAME'=>'name'}
0 row(s) in 2.3140 seconds

=> Hbase::Table - department

hbase(main):033:0> put 'department','1','name:dname','CMPN'
0 row(s) in 0.0270 seconds
hbase(main):034:0> put 'department','2','name:dname','IT'
0 row(s) in 0.0110 seconds
hbase(main):035:0> create 'project',{'NAME'=>'details'}
0 row(s) in 1.2450 seconds

=> Hbase::Table - project
hbase(main):036:0> put 'project','1','details:pname','Inventory'
0 row(s) in 0.0130 seconds
hbase(main):037:0> put 'project','1','details:dno','2'
0 row(s) in 0.0060 seconds
hbase(main):038:0> put 'project','2','details:pname','Inventory'
0 row(s) in 0.0060 seconds
hbase(main):039:0> put 'project','2','details:dno','1'
0 row(s) in 0.0150 seconds
hbase(main):040:0> put 'project','3','details:pname','Industrial'
0 row(s) in 0.0050 seconds
hbase(main):041:0> put 'project','3','details:dno','2'
0 row(s) in 0.0060 seconds
hbase(main):002:0> create 'dependent',{'NAME'=>'details'}
0 row(s) in 2.5910 seconds
=> Hbase::Table - dependent
hbase(main):003:0> put 'dependent','1','details:name','Varsha'
0 row(s) in 0.2840 seconds
hbase(main):004:0> put 'dependent','1','details:relation','Mother'
0 row(s) in 0.0070 seconds
hbase(main):005:0> put 'dependent','2','details:name','Devidas'
0 row(s) in 0.0130 seconds
hbase(main):006:0> put 'dependent','2','details:relation','Father'
0 row(s) in 0.0110 seconds

hbase(main):008:0> scan 'department',{COLUMNS=>'name:dname'}
ROW COLUMN+CELL
 1 column=name:dname, timestamp=1572250404319, value=CMPN
 2 column=name:dname, timestamp=1572250413830, value=IT
2 row(s) in 0.1990 seconds

hbase(main):009:0> count 'project'
3 row(s) in 0.0830 seconds

=> 3
hbase(main):010:0> scan 'dependent',{COLUMNS=>['details:name','details:relation']}
ROW COLUMN+CELL
 1 column=details:name, timestamp=1572336508036, value=Varsha
 1 column=details:relation, timestamp=1572336528477, value=Mother
 2 column=details:name, timestamp=1572336546638, value=Devidas
 2 column=details:relation, timestamp=1572336560001, value=Father
2 row(s) in 0.0290 seconds

hbase(main):011:0> alter 'dependent',NAME=>'cost'
Updating all regions with the new schema...
0/1 regions updated.
1/1 regions updated.
Done.
0 row(s) in 3.2010 seconds

OR (run either the query above or the one below)

hbase(main):014:0> alter 'dependent',NAME=>'cost',VERSIONS=>5
Updating all regions with the new schema...
1/1 regions updated.
Done.
0 row(s) in 1.9820 seconds



pyspark:


Input file words.txt
Hi
My
name
is
akshay
How
are
you
all
Hi again
Hello


PySpark frequency of each word in the document:
>>> rdd2=sc.textFile("file:/home/cloudera/Desktop/words.txt").flatMap(lambda line:line.split())
>>> rdd3=rdd2.map(lambda word:(word,1)).reduceByKey(lambda a,b:(a+b))
>>> print(rdd3.collect())
[(u'again', 1), (u'all', 1), (u'name', 1), (u'How', 1), (u'is', 1), (u'akshay', 1), (u'Hi', 2), (u'are', 1), (u'you', 1), (u'My', 1), (u'Hello', 1)]
>>> for i in rdd3.collect():
...     print(i)
...
(u'again', 1)
(u'all', 1)
(u'name', 1)
(u'How', 1)
(u'is', 1)
(u'akshay', 1)
(u'Hi', 2)
(u'are', 1)
(u'you', 1)
(u'My', 1)
(u'Hello', 1)
>>>

Sorting
>>> rdd4=rdd2.map(lambda word:(word,1)).reduceByKey(lambda a,b:(a+b))
>>> rdd5=rdd4.sortBy(lambda a:a[0]).collect()
>>> print(rdd5)
[(u'Hello', 1), (u'Hi', 2), (u'How', 1), (u'My', 1), (u'again', 1), (u'akshay', 1), (u'all', 1), (u'are', 1), (u'is', 1), (u'name', 1), (u'you', 1)]
>>> rdd6=rdd4.sortBy(lambda a:a[1]).collect()
>>> print(rdd6)
[(u'again', 1), (u'all', 1), (u'name', 1), (u'How', 1), (u'is', 1), (u'akshay', 1), (u'are', 1), (u'you', 1), (u'My', 1), (u'Hello', 1), (u'Hi', 2)]
>>>

Words starting with 'H'
>>> rdd1=sc.textFile("file:/home/cloudera/Desktop/words.txt").flatMap(lambda line:line.split())
>>> rdd2=rdd1.filter(lambda word:word.startswith('H')).collect()
>>> print(rdd2)
[u'Hi', u'How', u'Hi', u'Hello']
>>> print(len(rdd2))
4

4-letter words
>>> rdd1=sc.textFile("file:/home/cloudera/Desktop/words.txt").flatMap(lambda line:line.split())
>>> rdd2=rdd1.filter(lambda word:len(word)==4).collect()
>>> print(rdd2)
[u'name']
>>> print(len(rdd2))
1
>>>



Average Temperature:
Input file tempreture.txt
23
34
43
23
34
32
26
27
28
42
29


>>> rdd1=sc.textFile("file:/home/cloudera/Desktop/tempreture.txt").flatMap(lambda line:line.split())
>>> sum=0
>>> count=0
>>> for i in rdd1.collect():
...     sum=sum+int(i)
...     count=count+1
...
>>> print(sum)
341
>>> print(count)
11
>>> print("average temperature is",sum/count)
('average temperature is', 31)
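
The same average can also be computed with RDD operations instead of collecting to the driver; a minimal sketch, assuming the same input file with one reading per line (sum() and count() here are RDD methods, so this is equivalent to using rdd.mean()):
>>> temps = sc.textFile("file:/home/cloudera/Desktop/tempreture.txt").map(lambda line: float(line.strip()))
>>> print("average temperature is", temps.sum() / temps.count())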

Pagerank:
Input file pagerank.txt
page1 page3
page2 page1
page4 page1
page3 page1
page4 page2
page3 page4

>>> def computeContribs(neighbors, rank):
...     for neighbor in neighbors: yield(neighbor, rank/len(neighbors))
...
>>> linkfile="file:/home/cloudera/Desktop/pagerank.txt"
>>> links = sc.textFile(linkfile).map(lambda line: line.split())\
...     .map(lambda pages: (pages[0],pages[1]))\
...     .distinct()\
...     .groupByKey()\
...     .persist()

19/10/31 00:19:00 WARN shortcircuit.DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
>>>
Note: the next line re-assigns links without the distinct()/groupByKey() steps above, so each page's "neighbors" value is just a string; computeContribs then iterates over it character by character, which is why the Iteration 0 output below has single-character keys. Keep the grouped links RDD to get per-page ranks.
>>> links = sc.textFile(linkfile).map(lambda line: line.split())
>>> ranks=links.map(lambda (page,neighbors): (page,1.0))
>>> n = 10
>>> for x in xrange(n):
...     contribs=links\
...         .join(ranks)\
...         .flatMap(lambda (page,(neighbors,rank)): \
...             computeContribs(neighbors,rank))
...     ranks=contribs\
...         .reduceByKey(lambda v1,v2: v1+v2)\
...         .map(lambda (page,contrib): \
...             (page,contrib * 0.85 + 0.15))
...     print "Iteration ",x
...     for pair in ranks.take(10): print pair
...

Iteration 0
(u'a', 1.8499999999999999)
(u'1', 1.0)
(u'3', 0.32000000000000001)
(u'e', 1.8499999999999999)
(u'g', 1.8499999999999999)
(u'p', 1.8499999999999999)
(u'2', 0.48999999999999999)
(u'4', 0.48999999999999999)
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
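
For reference, a self-contained version of the same loop that keeps the grouped links RDD (a sketch; it avoids the Python 2 tuple-unpacking lambdas and should produce per-page ranks rather than the per-character output above):
def computeContribs(neighbors, rank):
    # spread a page's rank evenly over its outgoing links
    for neighbor in neighbors:
        yield (neighbor, rank / len(neighbors))

lines = sc.textFile("file:/home/cloudera/Desktop/pagerank.txt")
links = lines.map(lambda line: line.split()) \
             .map(lambda pages: (pages[0], pages[1])) \
             .distinct() \
             .groupByKey() \
             .cache()
ranks = links.map(lambda pn: (pn[0], 1.0))

for i in range(10):
    # join gives (page, (neighbors, rank)); fan the rank out, then re-aggregate per target page
    contribs = links.join(ranks) \
                    .flatMap(lambda pr: computeContribs(pr[1][0], pr[1][1]))
    ranks = contribs.reduceByKey(lambda a, b: a + b) \
                    .map(lambda pc: (pc[0], pc[1] * 0.85 + 0.15))

print(ranks.collect())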
Clustering:

data=sc.textFile("file:/home/training/Desktop/Admission_Predict.csv")
import numpy
from numpy import array
from pyspark.mllib.clustering import KMeans
header=data.first()
header = sc.parallelize([header])
data = data.subtract(header)
parseData = data.map(lambda line: array([float(x) for x in line.split(',')])).cache()
clusters = KMeans.train(parseData,3,maxIterations=15,runs=10,initializationMode='random')
from math import sqrt
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))
WSSSE = parseData.map(lambda point:error(point)).reduce(lambda x,y:x+y)
print('Within set sum of squared error = '+str(WSSSE))
clusters.centers
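
As a small follow-up (a sketch reusing the clusters model and parseData RDD from above), the learned centers can be printed and each record assigned to its nearest cluster; KMeansModel.predict() accepts either a single vector or an RDD:
# print each learned cluster center
for i, center in enumerate(clusters.centers):
    print(i, center)

# assign every parsed record to its nearest center
assignments = clusters.predict(parseData)
print(assignments.take(10))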

Matrix vector:

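One way to multiply a matrix by a vector with RDDs is to key the matrix entries by column and join them with the vector, using the same join primitive shown in the JOIN example below. A minimal sketch, assuming the matrix is held as (row, col, value) triples and the vector as (col, value) pairs (the in-line sample data is hypothetical):
# hypothetical 2x3 matrix as (row, col, value) and vector as (col, value)
matrix = sc.parallelize([(0,0,1.0),(0,1,2.0),(0,2,3.0),(1,0,4.0),(1,1,5.0),(1,2,6.0)])
vector = sc.parallelize([(0,1.0),(1,1.0),(2,2.0)])

# key matrix entries by column, join with the vector, multiply, then sum per row
byCol = matrix.map(lambda rcv: (rcv[1], (rcv[0], rcv[2])))
products = byCol.join(vector).map(lambda kv: (kv[1][0][0], kv[1][0][1]*kv[1][1]))
result = products.reduceByKey(lambda a, b: a + b)
print(result.collect())   # expected [(0, 9.0), (1, 21.0)] (order may vary)
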
JOIN
>>> rdd1=sc.parallelize([('hadoop',4),('map',5)])
>>> rdd2=sc.parallelize([('hadoop',9),('map',10)])
>>> rdd1.join(rdd2)
PythonRDD[298] at RDD at PythonRDD.scala:43
>>> joined=rdd1.join(rdd2)
>>> joined.collect()
[('map', (5, 10)), ('hadoop', (4, 9))]
>>>