从postgres导出json数据,然后用spark来处理。

可以先从 PostgreSQL 中 select 出所需要的数据，导出成文本文件（每行一条 JSON 记录），然后把文件内容读取进 Spark。

sql> copy (select info from items) to '/tmp/foo.json';
$ spark-shell
scala> val df = spark.read.json("/tmp/foo.json")
df: org.apache.spark.sql.DataFrame = [distinct_id: string, event: string ... 3 more fields]
scala> df.printSchema()
root
 |-- distinct_id: string (nullable = true)
 |-- event: string (nullable = true)
 |-- ip: long (nullable = true)
 |-- properties: struct (nullable = true)
 |    |-- $browser: string (nullable = true)
 |    |-- $browser_version: double (nullable = true)
 |    |-- $ce_version: long (nullable = true)
 |    |-- $current_url: string (nullable = true)
 |    |-- $device: string (nullable = true)
 |    |-- $el_attr__href: boolean (nullable = true)
 |    |-- $el_text: string (nullable = true)
 |    |-- $elements: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- attr__aria-disabled: string (nullable = true)
 |    |    |    |-- attr__aria-selected: string (nullable = true)
 |    |    |    |-- attr__class: string (nullable = true)
 |    |    |    |-- attr__id: string (nullable = true)
 |    |    |    |-- attr__role: string (nullable = true)
 |    |    |    |-- attr__style: string (nullable = true)
 |    |    |    |-- classes: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- nth_child: long (nullable = true)
 |    |    |    |-- nth_of_type: long (nullable = true)
 |    |    |    |-- tag_name: string (nullable = true)
 |    |-- $event_type: string (nullable = true)
 |    |-- $host: string (nullable = true)
 |    |-- $initial_referrer: string (nullable = true)
 |    |-- $initial_referring_domain: string (nullable = true)
 |    |-- $lib: string (nullable = true)
 |    |-- $lib_version: string (nullable = true)
 |    |-- $os: string (nullable = true)
 |    |-- $pathname: string (nullable = true)
 |    |-- $screen_height: long (nullable = true)
 |    |-- $screen_width: long (nullable = true)
 |    |-- $title: string (nullable = true)
 |    |-- distinct_id: string (nullable = true)
 |    |-- yyks_browser: string (nullable = true)
 |    |-- yyks_page: string (nullable = true)
 |    |-- yyks_platform: string (nullable = true)
 |-- timestamp: long (nullable = true)
scala> df.count()
res9: Long = 29

scala>

My Github Page: https://github.com/liweinan

Powered by Jekyll and Theme by solid

If you have any questions or find any bugs in my blog posts, please report them here:
https://github.com/liweinan/liweinan.github.io/issues