6:[["$","$Le",null,{}],["$","div",null,{"className":"min-h-screen bg-gray-100 p-6","children":[["$","$Lf",null,{}],["$","script",null,{"type":"application/ld+json","dangerouslySetInnerHTML":{"__html":"{\"@context\":\"https://schema.org\",\"@type\":\"QAPage\",\"mainEntity\":{\"@type\":\"Question\",\"name\":\"How to select distinct and non-null values from a dataframe column in pyspark\",\"text\":\"

How can select distinct and non-null values from a dataframe column in py-spark.

\\n\",\"author\":{\"@type\":\"Person\",\"name\":\"t1808\"},\"upvoteCount\":1,\"answerCount\":1,\"acceptedAnswer\":null}}"}}],["$","div",null,{"className":"bg-white shadow-md rounded-lg p-6 mb-6 relative","children":[["$","div",null,{"className":"absolute top-4 right-4 flex flex-wrap space-x-2","children":[["$","span","apache-spark",{"className":"bg-blue-600 text-white text-sm px-3 py-1 rounded-full","children":["$","$L10",null,{"href":"/discussion/tag/apache-spark/1","children":"apache-spark"}]}],["$","span","pyspark",{"className":"bg-blue-600 text-white text-sm px-3 py-1 rounded-full","children":["$","$L10",null,{"href":"/discussion/tag/pyspark/1","children":"pyspark"}]}]]}],["$","div",null,{"className":"flex items-center mb-4","children":[["$","img",null,{"src":"https://www.gravatar.com/avatar/9660d24ea30b880a0b433c6a14fcd8f0?s=256&d=identicon&r=PG&f=y&so-version=2","alt":"t1808","className":"w-16 h-16 rounded-full border"}],["$","div",null,{"className":"ml-4","children":[["$","a",null,{"href":"https://stackoverflow.com/users/11528412/t1808","target":"_blank","rel":"noopener noreferrer","className":"text-lg font-semibold text-blue-600 hover:underline","children":"t1808"}],["$","p",null,{"className":"text-sm text-gray-500","children":["Reputation: ",71]}]]}]]}],["$","h1",null,{"className":"text-2xl font-bold text-gray-800 mb-4","children":"How to select distinct and non-null values from a dataframe column in pyspark"}],["$","p",null,{"className":"text-gray-700 mt-4","dangerouslySetInnerHTML":{"__html":"

How can select distinct and non-null values from a dataframe column in py-spark.

\n"}}],["$","div",null,{"className":"text-gray-600 text-sm mt-4","children":[["$","p",null,{"children":["Upvotes: ",1]}],["$","p",null,{"children":["Views: ",3073]}]]}]]}],["$","div",null,{"className":"container mx-auto","children":[["$","h2",null,{"className":"text-2xl font-semibold text-gray-800 mb-6","children":["Answers (",1,")"]}],[["$","div","68235647",{"className":"bg-white shadow-md rounded-lg p-6 mb-6","children":[["$","div",null,{"className":"flex items-center mb-4","children":[["$","img",null,{"src":"https://www.gravatar.com/avatar/9660d24ea30b880a0b433c6a14fcd8f0?s=256&d=identicon&r=PG&f=y&so-version=2","alt":"t1808","className":"w-12 h-12 rounded-full border"}],["$","div",null,{"className":"ml-4","children":[["$","a",null,{"href":"https://stackoverflow.com/users/11528412/t1808","target":"_blank","rel":"noopener noreferrer","className":"text-lg font-semibold text-blue-600 hover:underline","children":"t1808"}],["$","p",null,{"className":"text-sm text-gray-500","children":["Reputation: ",71]}]]}]]}],["$","p",null,{"className":"text-gray-700 mb-4","dangerouslySetInnerHTML":{"__html":"

Ok, I figured it out...following is the command where i am selecting all the unique UserID's from column and excluding empty rows:

df.select('UserID').distinct().where(col("userid").isNotNull())\n

Still i believe there can possibly be better alternative.

\n"}}],["$","div",null,{"className":"text-gray-600 text-sm","children":["$","p",null,{"children":["Upvotes: ",1]}]}]]}]]]}],["$","div",null,{"className":"bg-white shadow-md rounded-lg p-6 mt-6","children":[["$","h2",null,{"className":"text-2xl font-semibold text-gray-800 mb-4","children":"Related Questions"}],["$","ul",null,{"className":"list-disc list-inside","children":[["$","li","39383557",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/39383557","className":"text-blue-600 hover:underline","children":"Show distinct column values in pyspark dataframe"}]}],["$","li","74608499",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/74608499","className":"text-blue-600 hover:underline","children":"How to list distinct values of pyspark dataframe wrt null values in another column"}]}],["$","li","39287729",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/39287729","className":"text-blue-600 hover:underline","children":"Filter rows by distinct values in one column in PySpark"}]}],["$","li","64805788",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/64805788","className":"text-blue-600 hover:underline","children":"Get distinct values of multiple columns"}]}],["$","li","64272570",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/64272570","className":"text-blue-600 hover:underline","children":"Pyspark - replace null values in column with distinct column value"}]}],["$","li","56414588",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/56414588","className":"text-blue-600 hover:underline","children":"How to find distinct values of multiple columns in Spark"}]}],["$","li","36179330",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/36179330","className":"text-blue-600 hover:underline","children":"Selecting values from non-null columns in a PySpark DataFrame"}]}],["$","li","52319958",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/52319958","className":"text-blue-600 hover:underline","children":"Pyspark - Select the distinct values from each column"}]}],["$","li","50692226",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/50692226","className":"text-blue-600 hover:underline","children":"Populate distinct of column based on another column in PySpark"}]}],["$","li","47211837",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/47211837","className":"text-blue-600 hover:underline","children":"Pyspark DataFrame select rows with distinct values, and rows with non-distinct values"}]}]]}]]}]]}],["$","$L11",null,{}],["$","$L12",null,{}],["$","$L13",null,{}],["$","$L14",null,{}],["$","$L15",null,{}]]

How to select distinct and non-null values from a dataframe column in pyspark

Answers (1)

Related Questions