# Pastebin hSRjvqpl

updating: listenbrainz_spark/ (stored 0%)
updating: listenbrainz_spark/stats_writer/ (stored 0%)
updating: listenbrainz_spark/stats_writer/stats_writer.py (deflated 67%)
updating: listenbrainz_spark/stats_writer/__init__.py (stored 0%)
updating: listenbrainz_spark/stats_writer/__pycache__/ (stored 0%)
updating: listenbrainz_spark/stats_writer/__pycache__/__init__.cpython-34.pyc (deflated 21%)
updating: listenbrainz_spark/stats_writer/__pycache__/stats_writer.cpython-34.pyc (deflated 45%)
updating: listenbrainz_spark/stats/ (stored 0%)
updating: listenbrainz_spark/stats/__init__.py (deflated 35%)
updating: listenbrainz_spark/stats/__pycache__/ (stored 0%)
updating: listenbrainz_spark/stats/__pycache__/__init__.cpython-34.pyc (deflated 31%)
updating: listenbrainz_spark/stats/__pycache__/user.cpython-34.pyc (deflated 62%)
updating: listenbrainz_spark/stats/user.py (deflated 75%)
updating: listenbrainz_spark/store-data/ (stored 0%)
updating: listenbrainz_spark/utils.py (deflated 29%)
updating: listenbrainz_spark/config.py (deflated 46%)
updating: listenbrainz_spark/constants.py (stored 0%)
updating: listenbrainz_spark/config.py.sample (deflated 37%)
updating: listenbrainz_spark/RabbitMQ/ (stored 0%)
updating: listenbrainz_spark/RabbitMQ/__init__.py (stored 0%)
updating: listenbrainz_spark/__init__.py (deflated 58%)
updating: listenbrainz_spark/data/ (stored 0%)
updating: listenbrainz_spark/data/__init__.py (stored 0%)
updating: listenbrainz_spark/data/import_dump.py (deflated 63%)
updating: listenbrainz_spark/hdfs_connection.py (deflated 32%)
updating: listenbrainz_spark/tables/ (stored 0%)
updating: listenbrainz_spark/__pycache__/ (stored 0%)
updating: listenbrainz_spark/__pycache__/__init__.cpython-34.pyc (deflated 37%)
updating: listenbrainz_spark/__pycache__/config.cpython-34.pyc (deflated 25%)
updating: listenbrainz_spark/schema.py (deflated 73%)
updating: listenbrainz_spark/__pycache__/hdfs_connection.cpython-34.pyc (deflated 28%)
updating: listenbrainz_spark/train_models.py (deflated 65%)
updating: listenbrainz_spark/recommend.py (deflated 63%)
updating: listenbrainz_spark/__pycache__/recommend.cpython-34.pyc (deflated 48%)
updating: listenbrainz_spark/__pycache__/train_models.cpython-34.pyc (deflated 49%)
updating: listenbrainz_spark/.recommend.py.swo (deflated 98%)
listenbrainz-jobs
listenbrainz-jobs
latest: Pulling from metabrainz/listenbrainz-spark
Digest: sha256:cdf808ecf5693aad742a6cdccc25539a3c0a1c498998aade5328cba521978718
Status: Downloaded newer image for metabrainz/listenbrainz-spark:latest
2019-03-19 16:30:04 WARN SparkConf:66 - The configuration key 'spark.scheduler.listenerbus.eventqueue.size' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'spark.scheduler.listenerbus.eventqueue.capacity' instead.
2019-03-19 16:30:04 WARN NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2019-03-19 16:30:05 WARN SparkConf:66 - The configuration key 'spark.scheduler.listenerbus.eventqueue.size' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'spark.scheduler.listenerbus.eventqueue.capacity' instead.
2019-03-19 16:30:07 WARN SparkConf:66 - The configuration key 'spark.scheduler.listenerbus.eventqueue.size' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'spark.scheduler.listenerbus.eventqueue.capacity' instead.
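The `listenbrainz_spark` package rebuilt into the zip above is what the driver later ships to the cluster (see the `Added file file:///rec/listenbrainz_spark.zip` entry further down). A minimal sketch of how a driver might attach such a zip, assuming an already-created `SparkContext`; only the path comes from this log, the rest is illustrative:

```python
# Minimal sketch (assumed, not the project's actual bootstrap code): attach the
# packaged listenbrainz_spark code so executors can import it inside tasks.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
sc.addPyFile("/rec/listenbrainz_spark.zip")  # path taken from the log above
```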
2019-03-19 16:30:07 INFO SparkContext:54 - Running Spark version 2.3.1
2019-03-19 16:30:07 INFO SparkContext:54 - Submitted application: Create_Dataframe
2019-03-19 16:30:07 INFO SecurityManager:54 - Changing view acls to: root
2019-03-19 16:30:07 INFO SecurityManager:54 - Changing modify acls to: root
2019-03-19 16:30:07 INFO SecurityManager:54 - Changing view acls groups to:
2019-03-19 16:30:07 INFO SecurityManager:54 - Changing modify acls groups to:
2019-03-19 16:30:07 INFO SecurityManager:54 - SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(root); groups with view permissions: Set(); users with modify permissions: Set(root); groups with modify permissions: Set()
2019-03-19 16:30:08 INFO Utils:54 - Successfully started service 'sparkDriver' on port 40515.
2019-03-19 16:30:08 INFO SparkEnv:54 - Registering MapOutputTracker
2019-03-19 16:30:08 INFO SparkEnv:54 - Registering BlockManagerMaster
2019-03-19 16:30:08 INFO BlockManagerMasterEndpoint:54 - Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information
2019-03-19 16:30:08 INFO BlockManagerMasterEndpoint:54 - BlockManagerMasterEndpoint up
2019-03-19 16:30:08 INFO DiskBlockManager:54 - Created local directory at /tmp/blockmgr-d007ab09-452b-41b1-9fd9-1438e6d3da85
2019-03-19 16:30:08 INFO MemoryStore:54 - MemoryStore started with capacity 4.1 GB
2019-03-19 16:30:08 INFO SparkEnv:54 - Registering OutputCommitCoordinator
2019-03-19 16:30:08 INFO log:192 - Logging initialized @6474ms
2019-03-19 16:30:08 INFO Server:346 - jetty-9.3.z-SNAPSHOT
2019-03-19 16:30:08 INFO Server:414 - Started @6662ms
2019-03-19 16:30:08 INFO AbstractConnector:278 - Started ServerConnector@2217ead1{HTTP/1.1,[http/1.1]}{0.0.0.0:4040}
2019-03-19 16:30:08 INFO Utils:54 - Successfully started service 'SparkUI' on port 4040.
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@1f9e915e{/jobs,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@b1d0215{/jobs/json,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@6af127a5{/jobs/job,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@4d9adcb7{/jobs/job/json,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@2e7472bb{/stages,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@4ddba4df{/stages/json,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@56b63602{/stages/stage,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@5053dc28{/stages/stage/json,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@3732f606{/stages/pool,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@6b0415e8{/stages/pool/json,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@58a48ea1{/storage,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@16d8ab01{/storage/json,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@36fafd1a{/storage/rdd,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@7d9a0578{/storage/rdd/json,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@1374dec{/environment,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@52e47e68{/environment/json,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@33175b03{/executors,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@1e8b29dc{/executors/json,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@25e85fcc{/executors/threadDump,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@540e7227{/executors/threadDump/json,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@5d48e4e6{/static,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@1cce93e7{/,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@14bf43c8{/api,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@6daa9aa3{/jobs/job/kill,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@d21f77b{/stages/stage/kill,null,AVAILABLE,@Spark}
2019-03-19 16:30:08 INFO SparkUI:54 - Bound SparkUI to 0.0.0.0, and started at http://10073a9f139e:4040
2019-03-19 16:30:08 INFO SparkContext:54 - Added file file:/rec/create_dataframes.py at spark://10073a9f139e:40515/files/create_dataframes.py with timestamp 1553013008950
2019-03-19 16:30:08 INFO Utils:54 - Copying /rec/create_dataframes.py to /tmp/spark-e8147460-18a4-416a-a149-6d2a534de387/userFiles-08c7e377-65bc-4cb0-8dcd-caa219f808aa/create_dataframes.py
2019-03-19 16:30:09 INFO SparkContext:54 - Added file file:///rec/listenbrainz_spark.zip at spark://10073a9f139e:40515/files/listenbrainz_spark.zip with timestamp 1553013009000
2019-03-19 16:30:09 INFO Utils:54 - Copying /rec/listenbrainz_spark.zip to /tmp/spark-e8147460-18a4-416a-a149-6d2a534de387/userFiles-08c7e377-65bc-4cb0-8dcd-caa219f808aa/listenbrainz_spark.zip
2019-03-19 16:30:09 INFO StandaloneAppClient$ClientEndpoint:54 - Connecting to master spark://spark-master.spark-network:7077...
2019-03-19 16:30:09 INFO TransportClientFactory:267 - Successfully created connection to spark-master.spark-network/10.0.0.164:7077 after 100 ms (0 ms spent in bootstraps)
2019-03-19 16:30:09 INFO StandaloneSchedulerBackend:54 - Connected to Spark cluster with app ID app-20190319163009-0030
2019-03-19 16:30:09 INFO StandaloneAppClient$ClientEndpoint:54 - Executor added: app-20190319163009-0030/0 on worker-20190317113929-10.0.0.175-43873 (10.0.0.175:43873) with 3 core(s)
2019-03-19 16:30:09 INFO StandaloneSchedulerBackend:54 - Granted executor ID app-20190319163009-0030/0 on hostPort 10.0.0.175:43873 with 3 core(s), 10.0 GB RAM
2019-03-19 16:30:09 INFO StandaloneAppClient$ClientEndpoint:54 - Executor added: app-20190319163009-0030/1 on worker-20190317113929-10.0.0.175-43873 (10.0.0.175:43873) with 3 core(s)
2019-03-19 16:30:09 INFO StandaloneSchedulerBackend:54 - Granted executor ID app-20190319163009-0030/1 on hostPort 10.0.0.175:43873 with 3 core(s), 10.0 GB RAM
2019-03-19 16:30:09 INFO StandaloneAppClient$ClientEndpoint:54 - Executor added: app-20190319163009-0030/2 on worker-20190317113929-10.0.0.176-43619 (10.0.0.176:43619) with 3 core(s)
2019-03-19 16:30:09 INFO StandaloneSchedulerBackend:54 - Granted executor ID app-20190319163009-0030/2 on hostPort 10.0.0.176:43619 with 3 core(s), 10.0 GB RAM
2019-03-19 16:30:09 INFO Utils:54 - Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 36911.
2019-03-19 16:30:09 INFO StandaloneAppClient$ClientEndpoint:54 - Executor added: app-20190319163009-0030/3 on worker-20190317113929-10.0.0.176-43619 (10.0.0.176:43619) with 3 core(s)
2019-03-19 16:30:09 INFO StandaloneSchedulerBackend:54 - Granted executor ID app-20190319163009-0030/3 on hostPort 10.0.0.176:43619 with 3 core(s), 10.0 GB RAM
2019-03-19 16:30:09 INFO NettyBlockTransferService:54 - Server created on 10073a9f139e:36911
2019-03-19 16:30:09 INFO StandaloneAppClient$ClientEndpoint:54 - Executor added: app-20190319163009-0030/4 on worker-20190318185326-10.0.0.202-45335 (10.0.0.202:45335) with 3 core(s)
2019-03-19 16:30:09 INFO StandaloneSchedulerBackend:54 - Granted executor ID app-20190319163009-0030/4 on hostPort 10.0.0.202:45335 with 3 core(s), 10.0 GB RAM
2019-03-19 16:30:09 INFO StandaloneAppClient$ClientEndpoint:54 - Executor added: app-20190319163009-0030/5 on worker-20190318185326-10.0.0.202-45335 (10.0.0.202:45335) with 3 core(s)
2019-03-19 16:30:09 INFO StandaloneSchedulerBackend:54 - Granted executor ID app-20190319163009-0030/5 on hostPort 10.0.0.202:45335 with 3 core(s), 10.0 GB RAM
2019-03-19 16:30:09 INFO BlockManager:54 - Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
2019-03-19 16:30:09 INFO StandaloneAppClient$ClientEndpoint:54 - Executor updated: app-20190319163009-0030/0 is now RUNNING
2019-03-19 16:30:09 INFO StandaloneAppClient$ClientEndpoint:54 - Executor updated: app-20190319163009-0030/1 is now RUNNING
2019-03-19 16:30:09 INFO StandaloneAppClient$ClientEndpoint:54 - Executor updated: app-20190319163009-0030/4 is now RUNNING
2019-03-19 16:30:09 INFO StandaloneAppClient$ClientEndpoint:54 - Executor updated: app-20190319163009-0030/5 is now RUNNING
2019-03-19 16:30:09 INFO StandaloneAppClient$ClientEndpoint:54 - Executor updated: app-20190319163009-0030/2 is now RUNNING
2019-03-19 16:30:09 INFO StandaloneAppClient$ClientEndpoint:54 - Executor updated: app-20190319163009-0030/3 is now RUNNING
2019-03-19 16:30:09 INFO BlockManagerMaster:54 - Registering BlockManager BlockManagerId(driver, 10073a9f139e, 36911, None)
2019-03-19 16:30:09 INFO BlockManagerMasterEndpoint:54 - Registering block manager 10073a9f139e:36911 with 4.1 GB RAM, BlockManagerId(driver, 10073a9f139e, 36911, None)
2019-03-19 16:30:09 INFO BlockManagerMaster:54 - Registered BlockManager BlockManagerId(driver, 10073a9f139e, 36911, None)
2019-03-19 16:30:09 INFO BlockManager:54 - Initialized BlockManager: BlockManagerId(driver, 10073a9f139e, 36911, None)
2019-03-19 16:30:09 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@4f5466a0{/metrics/json,null,AVAILABLE,@Spark}
2019-03-19 16:30:09 INFO StandaloneSchedulerBackend:54 - SchedulerBackend is ready for scheduling beginning after reached minRegisteredResourcesRatio: 0.0
2019-03-19 16:30:10 INFO SharedState:54 - Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir ('file:/rec/spark-warehouse/').
2019-03-19 16:30:10 INFO SharedState:54 - Warehouse path is 'file:/rec/spark-warehouse/'.
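The scheduler lines above show the job connecting to a standalone master and being granted six executors with 3 cores and 10 GB RAM each. A sketch of a session configured to match; the master URL, application name, and resource figures are read off the log, while the builder code itself is an assumption, not the project's setup code:

```python
from pyspark.sql import SparkSession

# Illustrative only: configuration values are taken from the log above.
session = (
    SparkSession.builder
    .appName("Create_Dataframe")
    .master("spark://spark-master.spark-network:7077")
    .config("spark.executor.cores", "3")      # 3 core(s) per executor
    .config("spark.executor.memory", "10g")   # 10.0 GB RAM per executor
    .getOrCreate()
)
```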
2019-03-19 16:30:10 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@2031ebb9{/SQL,null,AVAILABLE,@Spark}
2019-03-19 16:30:10 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@30cd458c{/SQL/json,null,AVAILABLE,@Spark}
2019-03-19 16:30:10 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@bf3514a{/SQL/execution,null,AVAILABLE,@Spark}
2019-03-19 16:30:10 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@5a63b8e5{/SQL/execution/json,null,AVAILABLE,@Spark}
2019-03-19 16:30:10 INFO ContextHandler:781 - Started o.s.j.s.ServletContextHandler@67e96c98{/static/sql,null,AVAILABLE,@Spark}
2019-03-19 16:30:11 INFO StateStoreCoordinatorRef:54 - Registered StateStoreCoordinator endpoint
listens for 2005.1 not found
listens for 2005.2 found
listens for 2005.3 found
listens for 2005.4 found
listens for 2005.5 found
listens for 2005.6 found
listens for 2005.7 found
listens for 2005.8 found
listens for 2005.9 found
listens for 2005.10 found
listens for 2005.11 found
listens for 2005.12 found
listens for 2006.1 found
listens for 2006.2 found
listens for 2006.3 found
listens for 2006.4 found
listens for 2006.5 found
listens for 2006.6 found
listens for 2006.7 found
listens for 2006.8 found
listens for 2006.9 found
listens for 2006.10 found
listens for 2006.11 found
listens for 2006.12 found
listens for 2007.1 found
listens for 2007.2 found
listens for 2007.3 found
listens for 2007.4 found
listens for 2007.5 found
listens for 2007.6 found
listens for 2007.7 found
listens for 2007.8 found
listens for 2007.9 found
listens for 2007.10 found
listens for 2007.11 found
listens for 2007.12 found
listens for 2008.1 found
listens for 2008.2 found
listens for 2008.3 found
listens for 2008.4 found
listens for 2008.5 found
listens for 2008.6 found
listens for 2008.7 found
listens for 2008.8 found
listens for 2008.9 found
listens for 2008.10 found
listens for 2008.11 found
listens for 2008.12 found
listens for 2009.1 found
listens for 2009.2 found
listens for 2009.3 found
listens for 2009.4 found
listens for 2009.5 found
listens for 2009.6 found
listens for 2009.7 found
listens for 2009.8 found
listens for 2009.9 found
listens for 2009.10 found
listens for 2009.11 found
listens for 2009.12 found
listens for 2010.1 found
listens for 2010.2 found
listens for 2010.3 found
listens for 2010.4 found
listens for 2010.5 found
listens for 2010.6 found
listens for 2010.7 found
listens for 2010.8 found
listens for 2010.9 found
listens for 2010.10 found
listens for 2010.11 found
listens for 2010.12 found
listens for 2011.1 found
listens for 2011.2 found
listens for 2011.3 found
listens for 2011.4 found
listens for 2011.5 found
listens for 2011.6 found
listens for 2011.7 found
listens for 2011.8 found
listens for 2011.9 found
listens for 2011.10 found
listens for 2011.11 found
listens for 2011.12 found
listens for 2012.1 found
listens for 2012.2 found
listens for 2012.3 found
listens for 2012.4 found
listens for 2012.5 found
listens for 2012.6 found
listens for 2012.7 found
listens for 2012.8 found
listens for 2012.9 found
listens for 2012.10 found
listens for 2012.11 found
listens for 2012.12 found
listens for 2013.1 found
listens for 2013.2 found
listens for 2013.3 found
listens for 2013.4 found
listens for 2013.5 found
listens for 2013.6 found
listens for 2013.7 found
listens for 2013.8 found
listens for 2013.9 found
listens for 2013.10 found
listens for 2013.11 found
listens for 2013.12 found
listens for 2014.1 found
listens for 2014.2 found
listens for 2014.3 found
listens for 2014.4 found
listens for 2014.5 found
listens for 2014.6 found
listens for 2014.7 found
listens for 2014.8 found
listens for 2014.9 found
listens for 2014.10 found
listens for 2014.11 found
listens for 2014.12 found
listens for 2015.1 found
listens for 2015.2 found
listens for 2015.3 found
listens for 2015.4 found
listens for 2015.5 found
listens for 2015.6 found
listens for 2015.7 found
listens for 2015.8 found
listens for 2015.9 found
listens for 2015.10 found
listens for 2015.11 found
listens for 2015.12 found
listens for 2016.1 found
listens for 2016.2 found
listens for 2016.3 found
listens for 2016.4 found
listens for 2016.5 found
listens for 2016.6 found
listens for 2016.7 found
listens for 2016.8 found
listens for 2016.9 found
listens for 2016.10 found
listens for 2016.11 found
listens for 2016.12 found
listens for 2017.1 found
listens for 2017.2 found
listens for 2017.3 found
listens for 2017.4 found
listens for 2017.5 found
listens for 2017.6 found
listens for 2017.7 found
listens for 2017.8 found
listens for 2017.9 found
listens for 2017.10 found
listens for 2017.11 found
listens for 2017.12 found
listens for 2018.1 found
listens for 2018.2 found
listens for 2018.3 found
listens for 2018.4 found
listens for 2018.5 found
listens for 2018.6 found
listens for 2018.7 found
listens for 2018.8 found
listens for 2018.9 found
listens for 2018.10 found
listens for 2018.11 found
listens for 2018.12 found
listens for 2019.1 found
listens for 2019.2 not found
listens for 2019.3 not found
listens for 2019.4 not found
listens for 2019.5 not found
listens for 2019.6 not found
listens for 2019.7 not found
listens for 2019.8 not found
listens for 2019.9 not found
listens for 2019.10 not found
listens for 2019.11 not found
listens for 2019.12 not found
root
 |-- artist_mbids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- artist_msid: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- listened_at: timestamp (nullable = true)
 |-- recording_mbid: string (nullable = true)
 |-- recording_msid: string (nullable = true)
 |-- release_mbid: string (nullable = true)
 |-- release_msid: string (nullable = true)
 |-- release_name: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- track_name: string (nullable = true)
 |-- user_name: string (nullable = true)
Registering Dataframe...
Number of rows in dataframe: 200272949
Dataframe loaded in: 102.89266967773438
Number of rows in users df: 3917
Users data prepared in 37.58564829826355
Load data dump...
Number of rows in listens df: 200272949
Listens data prepared in 16.5418803691864
Prepare recording dump...
Number of rows in recording df: 22292804
Recording data prepared in 62.08350348472595
Get playcounts...
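The playcounts step announced above boils down to counting listens per (user, recording) pair; the table printed below is the top of that aggregation. A minimal sketch under stated assumptions: `listens_with_ids` is a hypothetical dataframe that already carries `user_id` and `recording_id` columns (the column names come from the output below, the dataframe name does not):

```python
# Minimal sketch (assumed, not the project's code): aggregate plays per
# (user_id, recording_id) pair from a dataframe that carries both IDs.
playcounts_df = (
    listens_with_ids
    .groupBy("user_id", "recording_id")
    .count()
)
playcounts_df.show()  # prints the top 20 rows, as in the table below
```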
Time taken in registering dataframes 0.1789093017578125
Number of rows in intermediate df: 55535172
Playcount data prepared in 262.7443850040436
+-------+------------+-----+
|user_id|recording_id|count|
+-------+------------+-----+
|      1|    10707001|    1|
|      1|     3413148|    1|
|      1|    17722609|    2|
|      1|     8150900|    1|
|      1|    19959587|    1|
|      1|     6688408|   14|
|      1|    12503057|    1|
|      1|     9407100|    1|
|      1|     5302271|    3|
|      1|    12386096|    1|
|      1|    21586749|    2|
|      1|      228476|    1|
|      1|     7011256|    1|
|      1|    17384914|    2|
|      1|     5583359|    1|
|      1|     7139390|    1|
|      1|    14600172|   11|
|      1|     3745992|    1|
|      1|    15601179|    1|
|      1|    19323026|    5|
+-------+------------+-----+
only showing top 20 rows

None
Playcount df split in: 501.0833387374878
2019-03-19 16:46:44 ERROR TaskSetManager:70 - Task 8 in stage 230.0 failed 4 times; aborting job
Traceback (most recent call last):
  File "/rec/create_dataframes.py", line 101, in <module>
    train_models.main(playcounts_df)
  File "/rec/listenbrainz_spark/train_models.py", line 67, in main
    num_training = training_data.count()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1073, in count
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1064, in sum
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 935, in fold
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 834, in collect
  File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
  File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 8 in stage 230.0 failed 4 times, most recent failure: Lost task 8.3 in stage 230.0 (TID 51145, 10.0.0.176, executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 217, in main
    func, profiler, deserializer, serializer = read_command(pickleSer, infile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 59, in read_command
    command = serializer._read_with_length(file)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 170, in _read_with_length
    return self.loads(obj)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 559, in loads
    return pickle.loads(obj, encoding=encoding)
  File "./listenbrainz_spark.zip/listenbrainz_spark/train_models.py", line 11, in <module>
    from pyspark.mllib.recommendation import ALS, Rating
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/mllib/__init__.py", line 28, in <module>
    import numpy
ImportError: No module named 'numpy'

    at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
    at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
    at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
    at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
    at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
    at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
    at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:162)
    at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 217, in main
    func, profiler, deserializer, serializer = read_command(pickleSer, infile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 59, in read_command
    command = serializer._read_with_length(file)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 170, in _read_with_length
    return self.loads(obj)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 559, in loads
    return pickle.loads(obj, encoding=encoding)
  File "./listenbrainz_spark.zip/listenbrainz_spark/train_models.py", line 11, in <module>
    from pyspark.mllib.recommendation import ALS, Rating
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/mllib/__init__.py", line 28, in <module>
    import numpy
ImportError: No module named 'numpy'

    at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
    at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
    at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
    at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
    at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
    at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
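The root cause is the executor-side `ImportError`: line 11 of `train_models.py` imports `pyspark.mllib.recommendation`, and `pyspark/mllib/__init__.py` in turn does `import numpy`, which is missing from the workers' Python environment (apparently Python 3.4, judging by the `.pyc` files in the zip listing) even though the driver got as far as splitting the playcounts. A diagnostic sketch, assuming a live `SparkContext` named `sc`; the actual fix would be installing numpy into the worker image:

```python
# Diagnostic sketch (assumed, not part of the original job): check whether
# numpy is importable inside executor processes, where pyspark.mllib needs it.
def numpy_available(_partition):
    try:
        import numpy  # noqa: F401
        yield True
    except ImportError:
        yield False

flags = (
    sc.parallelize(range(sc.defaultParallelism), sc.defaultParallelism)
      .mapPartitions(numpy_available)
      .collect()
)
print("numpy importable on all executors:", all(flags))
```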