Hello community,
Please let me know if I'm in the wrong forum for the following question.
I have created the following pyspark.sql query.
#%%
import findspark
findspark.init('/home/packt/spark-2.1.0-bin-hadoop2.7')
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ops').getOrCreate()
df = spark.read.csv('/home/packt/Downloads/Spark_DataFrames/Person_Person.csv',inferSchema=True,header=True)
df = spark.read.csv('/home/packt/Downloads/Spark_DataFrames/Person_Password.csv',inferSchema=True,header=True)
df.createOrReplaceTempView('Person_Person')
df.createOrReplaceTempView('Person_Password')
myresults = spark.sql("""SELECT
FirstName,
LastName,
PasswordHash
FROM Person_Person
INNER JOIN Person_Password
ON BusinessEntityID = BusinessEntityID""")
myresults.show()
The query attempts to carryout a simple Inner Join.
However, I'm getting the following error:
During handling of the above exception, another exception occurred:
AnalysisException Traceback (most recent call last)
in ()
14 FROM Person_Person
15 INNER JOIN Person_Password
---> 16 ON BusinessEntityID = BusinessEntityID""")
17 myresults.show()
~/spark-2.1.0-bin-hadoop2.7/python/pyspark/sql/session.py in sql(self, sqlQuery)
539 [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]
540 """
--> 541 return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
542
543 @since(2.0)
~/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in call(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
~/spark-2.1.0-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
67 e.java_exception.getStackTrace()))
68 if s.startswith('org.apache.spark.sql.AnalysisException: '):
---> 69 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
70 if s.startswith('org.apache.spark.sql.catalyst.analysis'):
71 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
AnalysisException: "Reference 'BusinessEntityID' is ambiguous, could be: BusinessEntityID#639, BusinessEntityID#657.; line 7 pos 3"
Can someone let me know where I'm going wrong with the Inner Join?
Cheers