-
Notifications
You must be signed in to change notification settings - Fork 0
/
s3demo.py
62 lines (31 loc) · 974 Bytes
/
s3demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Databricks notebook source
import pyspark
# COMMAND ----------
# Load the flight CSV from S3 into a DataFrame. The first line of the file is
# treated as the header, and column types are inferred from the data instead
# of defaulting every column to string.
csvdf = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("s3://cdb20dw061-flightdata/Flightdata.csv")
)
# COMMAND ----------
# Print the DataFrame's string representation.
print(csvdf)
# COMMAND ----------
# Show the Python type of the loaded object.
type(csvdf)
# COMMAND ----------
# Print the schema of the loaded DataFrame as a tree.
csvdf.printSchema()
# COMMAND ----------
# Total number of rows in the DataFrame.
csvdf.count()
# COMMAND ----------
# Fetch the first two rows.
csvdf.take(2)
# COMMAND ----------
# List of column names.
csvdf.columns
# COMMAND ----------
# Number of columns.
len(csvdf.columns)
# COMMAND ----------
# Print the first rows of the DataFrame as a table.
csvdf.show()
# COMMAND ----------
# Expose the DataFrame to Spark SQL under the name "flightcsv".
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0; re-running this cell simply
# rebinds the view name to the current DataFrame.
csvdf.createOrReplaceTempView("flightcsv")
# COMMAND ----------
# Preview up to 100 rows through SQL. spark.sql is used (rather than the
# legacy sqlContext) so the notebook relies on a single entry point, the same
# SparkSession that loaded the data.
spark.sql('select * from flightcsv limit 100').show(100)
# COMMAND ----------
# Count rows per (Dest, TailNum) pair, excluding tail numbers equal to 'NA'
# or starting with '0'. Results are ordered by the count (descending) and
# then by destination.
query1 = spark.sql("select Dest,TailNum,count(tailnum) as Total_count from flightcsv where TailNum != 'NA' and TailNum NOT LIKE '0%' group by Dest, TailNum order by Total_count desc,dest")
# COMMAND ----------
# NOTE(review): collect() brings every result row back to the driver. That is
# fine for a small aggregate, but prefer show()/take(n) if the result set can
# be large.
query1.collect()
# COMMAND ----------
# Print the first rows of the aggregated result as a table.
query1.show()
# COMMAND ----------