<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: spark.read.load return dataframe bigger than actual csv file. in HPE Ezmeral Software platform</title>
    <link>https://community.hpe.com/t5/hpe-ezmeral-software-platform/spark-read-load-return-dataframe-bigger-than-actual-csv-file/m-p/7170105#M234</link>
    <description>&lt;P&gt;The issue has already been addressed. Please install the following RPM&amp;nbsp;mapr-spark-3.2.0.1.202204272354-1.noarch which is&amp;nbsp; uploaded to SFTP [&lt;A href="https://sftp.mapr.com/" target="_blank"&gt;https://sftp.mapr.com/&lt;/A&gt;] under path - /ecosystem/rpm/spark/mep-8.1.0. The username to login is "maprpatches".&lt;/P&gt;&lt;P&gt;Maven dependency :&lt;/P&gt;&lt;P&gt;&amp;lt;dependency&amp;gt;&lt;BR /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;&amp;lt;groupId&amp;gt;org.apache.spark&amp;lt;/groupId&amp;gt;&lt;BR /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;&amp;lt;artifactId&amp;gt;spark-core_2.12&amp;lt;/artifactId&amp;gt;&lt;BR /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;&amp;lt;version&amp;gt;3.2.0.1-eep-810&amp;lt;/version&amp;gt;&lt;BR /&gt;&amp;lt;/dependency&amp;gt;&lt;/P&gt;</description>
    <pubDate>Fri, 08 Jul 2022 02:22:28 GMT</pubDate>
    <dc:creator>Vinayak_Meghraj</dc:creator>
    <dc:date>2022-07-08T02:22:28Z</dc:date>
    <item>
      <title>spark.read.load return dataframe bigger than actual csv file.</title>
      <link>https://community.hpe.com/t5/hpe-ezmeral-software-platform/spark-read-load-return-dataframe-bigger-than-actual-csv-file/m-p/7166482#M228</link>
      <description>&lt;P&gt;Hi ,&lt;BR /&gt;&lt;BR /&gt;I am testing Ezmeral 7.0.0 with EEP 8.1.0 (Spark 3.2.0).&lt;BR /&gt;When I use the spark.read.load function to load a csv file (around 5MB in size) , the dataframe record count does not match the csv record count.&lt;BR /&gt;The CSV file record count is 51000.&lt;BR /&gt;The dataframe record count (result of df.count()) is 51953.&lt;BR /&gt;(The executor might load the same record twice.)&lt;BR /&gt;&lt;BR /&gt;A CSV file smaller than 4MB&amp;nbsp; does not reproduce this issue.&lt;BR /&gt;If you have any solution , please share it with me.&lt;BR /&gt;&lt;BR /&gt;My sample pyspark code is below:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;# -*- coding: utf-8 -*-
import sys
from pyspark.sql import SparkSession

if __name__ == '__main__':
    # spark session
    spark = SparkSession.builder.getOrCreate()

    mapr_path = 'maprfs:/rawdata/test.csv'
    df = spark.read.load(mapr_path, format="csv", header=True)
    print(df.count())

    sys.exit(0)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;Thanks in advance.&lt;BR /&gt;&lt;BR /&gt;&lt;STRONG&gt;Additional:&lt;BR /&gt;&lt;/STRONG&gt;I tried running the same program in an OSS environment&amp;nbsp; (hadoop 3.2 and spark 3.2.0).&lt;BR /&gt;It does not reproduce this issue ; the result is correct. I suspect EEP 8.1.0 has a problem.&lt;/P&gt;</description>
      <pubDate>Wed, 18 May 2022 05:15:41 GMT</pubDate>
      <guid>https://community.hpe.com/t5/hpe-ezmeral-software-platform/spark-read-load-return-dataframe-bigger-than-actual-csv-file/m-p/7166482#M228</guid>
      <dc:creator>smarte_basis</dc:creator>
      <dc:date>2022-05-18T05:15:41Z</dc:date>
    </item>
    <item>
      <title>Query: spark.read.load return dataframe bigger than actual csv file.</title>
      <link>https://community.hpe.com/t5/hpe-ezmeral-software-platform/spark-read-load-return-dataframe-bigger-than-actual-csv-file/m-p/7166487#M229</link>
      <description>&lt;P style="margin: 0;"&gt;&lt;STRONG&gt;System recommended content:&lt;/STRONG&gt;&lt;/P&gt;
&lt;P style="margin: 0;"&gt;1. &lt;A href="https://hpe.to/6604zMuXq" target="_blank" rel="noopener"&gt;HPE Ezmeral Data Fabric 7.0 Documentation |  SparkSQL and DataFrames&lt;/A&gt;&lt;/P&gt;
&lt;P style="margin: 0;"&gt;&amp;nbsp;&lt;/P&gt;
&lt;P style="margin: 0;"&gt;Please click on "Thumbs Up/Kudo" icon to give a "Kudo".&lt;/P&gt;
&lt;P style="margin: 0;"&gt;&amp;nbsp;&lt;/P&gt;
&lt;P style="margin: 0;"&gt;Thank you for being a HPE valuable community member.&lt;/P&gt;</description>
      <pubDate>Tue, 17 May 2022 08:00:03 GMT</pubDate>
      <guid>https://community.hpe.com/t5/hpe-ezmeral-software-platform/spark-read-load-return-dataframe-bigger-than-actual-csv-file/m-p/7166487#M229</guid>
      <dc:creator>support_s</dc:creator>
      <dc:date>2022-05-17T08:00:03Z</dc:date>
    </item>
    <item>
      <title>Re: spark.read.load return dataframe bigger than actual csv file.</title>
      <link>https://community.hpe.com/t5/hpe-ezmeral-software-platform/spark-read-load-return-dataframe-bigger-than-actual-csv-file/m-p/7170105#M234</link>
      <description>&lt;P&gt;The issue has already been addressed. Please install the following RPM&amp;nbsp;mapr-spark-3.2.0.1.202204272354-1.noarch which is&amp;nbsp; uploaded to SFTP [&lt;A href="https://sftp.mapr.com/" target="_blank"&gt;https://sftp.mapr.com/&lt;/A&gt;] under path - /ecosystem/rpm/spark/mep-8.1.0. The username to login is "maprpatches".&lt;/P&gt;&lt;P&gt;Maven dependency :&lt;/P&gt;&lt;P&gt;&amp;lt;dependency&amp;gt;&lt;BR /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;&amp;lt;groupId&amp;gt;org.apache.spark&amp;lt;/groupId&amp;gt;&lt;BR /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;&amp;lt;artifactId&amp;gt;spark-core_2.12&amp;lt;/artifactId&amp;gt;&lt;BR /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;&amp;lt;version&amp;gt;3.2.0.1-eep-810&amp;lt;/version&amp;gt;&lt;BR /&gt;&amp;lt;/dependency&amp;gt;&lt;/P&gt;</description>
      <pubDate>Fri, 08 Jul 2022 02:22:28 GMT</pubDate>
      <guid>https://community.hpe.com/t5/hpe-ezmeral-software-platform/spark-read-load-return-dataframe-bigger-than-actual-csv-file/m-p/7170105#M234</guid>
      <dc:creator>Vinayak_Meghraj</dc:creator>
      <dc:date>2022-07-08T02:22:28Z</dc:date>
    </item>
  </channel>
</rss>

