· 6 years ago · May 16, 2019, 03:10 PM
////// RUN
// Adjust the volume mapping below so that it points at your own data directory.
3
# Launch an interactive spark-shell inside the SeQuiLa image.
# The host data directory is mounted at /data and the container is removed on exit.
docker run --interactive --tty --rm \
  --env USERID=$UID --env GROUPID=$(id -g) \
  --volume /Users/aga/workplace/data/slice/:/data \
  biodatageeks/bdg-sequila:0.5.5-spark-2.4.2-SNAPSHOT \
  spark-shell --driver-memory=4g \
    --jars /tmp/bdg-toolset/bdg-sequila-assembly-0.5.5-spark-2.4.2-SNAPSHOT.jar \
    --conf spark.sql.warehouse.dir=/home/bdgeek/spark-warehouse
11
12
13
//////// SCRIPT
15
// Limit Spark logging to WARN and above.
sc.setLogLevel("WARN")

import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister,BDGInternalParams}

// Build a SequilaSession on top of the shell's SparkSession and hand it to SequilaRegister.
val ss = SequilaSession(spark)
SequilaRegister.register(ss)

// BAM reader switches, applied to the session's SQL configuration in order.
Seq(
  "spark.biodatageeks.bam.useGKLInflate" -> "true",
  "spark.biodatageeks.bam.useSparkBAM" -> "false"
).foreach { case (key, value) => ss.sqlContext.setConf(key, value) }

// Create the working database if it is missing, then make it the current one.
Seq(
  "CREATE DATABASE IF NOT EXISTS dna",
  "USE dna"
).foreach(statement => ss.sql(statement))
30
31
/**
 * Registers a BAM file as a table backed by the SeQuiLa BAM data source and
 * prints the first rows of the `bdg_coverage` result in 'blocks' mode.
 *
 * The table is created with IF NOT EXISTS, matching the database creation
 * statement, so the script can be re-run in the same session without failing
 * on an already registered table. Note that an existing table keeps the path
 * it was originally created with.
 *
 * @param table  name of the table to create and query
 * @param path   location of the BAM file passed to the data source
 * @param sample sample identifier passed as the second argument of bdg_coverage
 * @param rows   number of result rows to print
 */
def registerAndShowCoverage(table: String, path: String, sample: String, rows: Int = 5): Unit = {
  ss.sql(
    s"""
       |CREATE TABLE IF NOT EXISTS ${table}
       |USING org.biodatageeks.datasources.BAM.BAMDataSource
       |OPTIONS(path "${path}")
       |
      """.stripMargin)

  ss.sql(s"SELECT * FROM bdg_coverage('${table}','${sample}', 'blocks')").show(rows)
}

// NA12878 slice
val bamPath = "/data/NA12878.slice.bam"
val tableNameBAM = "reads"
registerAndShowCoverage(tableNameBAM, bamPath, "NA12878.slice")

// Small test BAM
val testPath = "/data/test2.bam"
val testTable = "reads_test"
registerAndShowCoverage(testTable, testPath, "test2")