@@ -18,23 +18,28 @@ class SalesforceCommitMessage(WriterCommitMessage):
 
 class SalesforceDataSource(DataSource):
     """
-    A Salesforce streaming sink for PySpark to write data to Salesforce objects.
+    A Salesforce streaming datasource for PySpark to write data to Salesforce objects.
 
-    This data sink enables writing streaming data from Spark to Salesforce using the
+    This datasource enables writing streaming data from Spark to Salesforce using the
     Salesforce REST API. It supports common Salesforce objects like Account, Contact,
     Opportunity, and custom objects.
 
-    Note: This is a write-only sink, not a full bidirectional data source.
+    Note: This is a write-only datasource, not a full bidirectional data source.
 
     Name: `salesforce`
 
     Notes
     -----
     - Requires the `simple-salesforce` library for Salesforce API integration
-    - **Write-only sink**: Only supports streaming write operations (no read operations)
+    - **Write-only datasource**: Only supports streaming write operations (no read operations)
     - Uses Salesforce username/password/security token authentication
     - Supports batch writing with Salesforce Composite Tree API for efficient processing
     - Implements exactly-once semantics through Spark's checkpoint mechanism
+    - If a streaming write job fails and is resumed from the checkpoint,
+      it will not overwrite records already written in Salesforce;
+      it resumes from the last committed offset.
+      However, if records were written to Salesforce but not yet committed at the time of failure,
+      duplicate records may occur after recovery.
 
     Parameters
     ----------
@@ -57,7 +62,7 @@ class SalesforceDataSource(DataSource):
 
     Examples
     --------
-    Register the Salesforce sink:
+    Register the Salesforce datasource:
 
     >>> from pyspark_datasources import SalesforceDataSource
     >>> spark.dataSource.register(SalesforceDataSource)
@@ -78,9 +83,9 @@ class SalesforceDataSource(DataSource):
     ...     (col("value") * 100000).cast("double").alias("AnnualRevenue")
     ... )
     >>>
-    >>> # Write to Salesforce using the sink
+    >>> # Write to Salesforce using the datasource
     >>> query = account_data.writeStream \\
-    ...     .format("salesforce") \\
+    ...     .format("pyspark.datasource.salesforce") \\
     ...     .option("username", "your-username@company.com") \\
     ...     .option("password", "your-password") \\
     ...     .option("security_token", "your-security-token") \\
@@ -98,7 +103,7 @@ class SalesforceDataSource(DataSource):
     ... )
     >>>
     >>> query = contact_data.writeStream \\
-    ...     .format("salesforce") \\
+    ...     .format("pyspark.datasource.salesforce") \\
     ...     .option("username", "your-username@company.com") \\
     ...     .option("password", "your-password") \\
     ...     .option("security_token", "your-security-token") \\
@@ -114,7 +119,7 @@ class SalesforceDataSource(DataSource):
     ... )
     >>>
     >>> query = custom_data.writeStream \\
-    ...     .format("salesforce") \\
+    ...     .format("pyspark.datasource.salesforce") \\
     ...     .option("username", "your-username@company.com") \\
     ...     .option("password", "your-password") \\
     ...     .option("security_token", "your-security-token") \\
@@ -128,7 +133,7 @@ class SalesforceDataSource(DataSource):
     >>> contact_schema = "FirstName STRING NOT NULL, LastName STRING NOT NULL, Email STRING, Phone STRING"
     >>>
     >>> query = contact_data.writeStream \\
-    ...     .format("salesforce") \\
+    ...     .format("pyspark.datasource.salesforce") \\
     ...     .option("username", "your-username@company.com") \\
     ...     .option("password", "your-password") \\
     ...     .option("security_token", "your-security-token") \\
@@ -148,7 +153,7 @@ class SalesforceDataSource(DataSource):
     ... )
     >>>
     >>> query = opportunity_data.writeStream \\
-    ...     .format("salesforce") \\
+    ...     .format("pyspark.datasource.salesforce") \\
     ...     .option("username", "your-username@company.com") \\
     ...     .option("password", "your-password") \\
     ...     .option("security_token", "your-security-token") \\
@@ -159,7 +164,7 @@ class SalesforceDataSource(DataSource):
 
     Key Features:
 
-    - **Write-only sink**: Designed specifically for writing data to Salesforce
+    - **Write-only datasource**: Designed specifically for writing data to Salesforce
     - **Batch processing**: Uses Salesforce Composite Tree API for efficient bulk writes
     - **Exactly-once semantics**: Integrates with Spark's checkpoint mechanism
     - **Error handling**: Graceful fallback to individual record creation if batch fails
@@ -168,8 +173,8 @@ class SalesforceDataSource(DataSource):
 
     @classmethod
     def name(cls) -> str:
-        """Return the short name for this Salesforce sink."""
-        return "salesforce"
+        """Return the short name for this Salesforce datasource."""
+        return "pyspark.datasource.salesforce"
 
     def schema(self) -> str:
         """
@@ -196,12 +201,12 @@ def schema(self) -> str:
         """
 
     def streamWriter(self, schema: StructType, overwrite: bool) -> "SalesforceStreamWriter":
-        """Create a stream writer for Salesforce sink integration."""
+        """Create a stream writer for Salesforce datasource integration."""
         return SalesforceStreamWriter(schema, self.options)
 
 
 class SalesforceStreamWriter(DataSourceStreamWriter):
-    """Stream writer implementation for Salesforce sink integration."""
+    """Stream writer implementation for Salesforce datasource integration."""
 
     def __init__(self, schema: StructType, options: Dict[str, str]):
         self.schema = schema
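For context, here is a minimal sketch of what a caller's code looks like after this rename. It assumes the pyspark-datasources package is installed, an active SparkSession bound to `spark`, a streaming DataFrame `account_data` shaped like the Account example above, and placeholder credentials; the `salesforce_object` option name and the checkpoint path are illustrative, not confirmed by this diff.

from pyspark_datasources import SalesforceDataSource

# Register once per session; the string passed to .format() must match
# the new value returned by SalesforceDataSource.name().
spark.dataSource.register(SalesforceDataSource)

query = (
    account_data.writeStream
    .format("pyspark.datasource.salesforce")  # previously "salesforce"
    .option("username", "your-username@company.com")
    .option("password", "your-password")
    .option("security_token", "your-security-token")
    .option("salesforce_object", "Account")  # hypothetical option name
    # The checkpoint is what lets a restarted query resume from the last
    # committed offset; per the note added above, records written to
    # Salesforce but not yet committed before a failure may be duplicated.
    .option("checkpointLocation", "/tmp/salesforce_checkpoint")
    .start()
)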