从 PostgreSQL 导出 JSON 数据,然后用 Spark 来处理。
可以从 PostgreSQL 中 select 出所需要的数据,导出为文本文件(每行一个 JSON 对象,即 JSON Lines 格式,这正是 spark.read.json 默认期望的输入),然后把文件内容读取进 Spark。
sql> copy (select info from items) to '/tmp/foo.json';
-- 注意:COPY ... TO '文件路径' 是在数据库服务器端写文件,需要超级用户权限;若要导出到客户端本机,可改用 psql 的 \copy 命令。
$ spark-shell
scala> val df = spark.read.json("/tmp/foo.json")
df: org.apache.spark.sql.DataFrame = [distinct_id: string, event: string ... 3 more fields]
scala> df.printSchema()
root
|-- distinct_id: string (nullable = true)
|-- event: string (nullable = true)
|-- ip: long (nullable = true)
|-- properties: struct (nullable = true)
| |-- $browser: string (nullable = true)
| |-- $browser_version: double (nullable = true)
| |-- $ce_version: long (nullable = true)
| |-- $current_url: string (nullable = true)
| |-- $device: string (nullable = true)
| |-- $el_attr__href: boolean (nullable = true)
| |-- $el_text: string (nullable = true)
| |-- $elements: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- attr__aria-disabled: string (nullable = true)
| | | |-- attr__aria-selected: string (nullable = true)
| | | |-- attr__class: string (nullable = true)
| | | |-- attr__id: string (nullable = true)
| | | |-- attr__role: string (nullable = true)
| | | |-- attr__style: string (nullable = true)
| | | |-- classes: array (nullable = true)
| | | | |-- element: string (containsNull = true)
| | | |-- nth_child: long (nullable = true)
| | | |-- nth_of_type: long (nullable = true)
| | | |-- tag_name: string (nullable = true)
| |-- $event_type: string (nullable = true)
| |-- $host: string (nullable = true)
| |-- $initial_referrer: string (nullable = true)
| |-- $initial_referring_domain: string (nullable = true)
| |-- $lib: string (nullable = true)
| |-- $lib_version: string (nullable = true)
| |-- $os: string (nullable = true)
| |-- $pathname: string (nullable = true)
| |-- $screen_height: long (nullable = true)
| |-- $screen_width: long (nullable = true)
| |-- $title: string (nullable = true)
| |-- distinct_id: string (nullable = true)
| |-- yyks_browser: string (nullable = true)
| |-- yyks_page: string (nullable = true)
| |-- yyks_platform: string (nullable = true)
|-- timestamp: long (nullable = true)
scala> df.count()
res9: Long = 29
scala>
- sql - Save PL/pgSQL output from PostgreSQL to a CSV file - Stack Overflow
- Fast CSV and JSON Ingestion in PostgreSQL with COPY
- postgresql - Inserting valid json with copy into postgres table - Stack Overflow
- PostgreSQL: Documentation: 11: 9.15. JSON Functions and Operators
- Create Quick JSON Data Dumps From PostgreSQL
- Is PostgreSQL Your Next JSON Database? - Compose Articles
- Postgres JSON: Unleash the Power of Storing JSON in Postgres