Submitting Spark jobs with Python (PySpark)

spark-submit

spark-submit --master yarn --deploy-mode cluster --queue q1 --num-executors 1 script.py
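The options above run the driver in YARN cluster mode on queue q1 with a single executor. If the job needs extra Python modules or more resources, spark-submit can also ship a zip of dependencies with --py-files and size the executors; a variant along those lines (deps.zip and the resource numbers are placeholders, script.py is the driver script above):

spark-submit --master yarn --deploy-mode cluster --queue q1 \
    --num-executors 4 --executor-memory 2g --executor-cores 2 \
    --py-files deps.zip \
    script.py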

pyspark

# Spark 1.x style: HiveContext and DataFrame.mapPartitions are available here.
from base64 import b64encode

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def process(rows):
    # Concatenate the base64-encoded url field of every row in this partition.
    content = ""
    for row in rows:
        content += b64encode(row.url)
    return [content]


conf = SparkConf().setAppName('PoliceHive2Xml')
spark_context = SparkContext(conf=conf)
hive_context = HiveContext(spark_context)

sql = "select * from table where dayno=20170807 limit 1000"
data_frame = hive_context.sql(sql)

# get_hdfs_filepath, table_name and zip_file_name come from elsewhere in the original script.
hdfs_filepath = get_hdfs_filepath(table_name, zip_file_name)
data_frame.mapPartitions(process).saveAsTextFile(hdfs_filepath)
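
The script above targets the Spark 1.x API (HiveContext, DataFrame.mapPartitions). On Spark 2.x and later, the same job would go through SparkSession and the DataFrame's underlying RDD; a minimal sketch, assuming Python 3 and a placeholder output path:

from base64 import b64encode

from pyspark.sql import SparkSession

# SparkSession with Hive support replaces SparkConf/SparkContext/HiveContext.
spark = (SparkSession.builder
         .appName('PoliceHive2Xml')
         .enableHiveSupport()
         .getOrCreate())


def process(rows):
    # Python 3: b64encode takes and returns bytes, so encode/decode explicitly.
    content = ""
    for row in rows:
        content += b64encode(row.url.encode('utf-8')).decode('ascii')
    return [content]


data_frame = spark.sql("select * from table where dayno=20170807 limit 1000")

# DataFrame no longer exposes mapPartitions; go through the underlying RDD.
data_frame.rdd.mapPartitions(process).saveAsTextFile("/tmp/b64_urls")  # placeholder path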