1+ use std:: sync:: Arc ;
2+
3+ use bytes:: Bytes ;
14use chrono:: { DateTime , Utc } ;
5+ use futures:: stream:: BoxStream ;
6+ use futures:: StreamExt ;
27use object_store:: { GetOptions , GetResult , ObjectStore } ;
3- use pyo3:: exceptions:: PyValueError ;
8+ use pyo3:: exceptions:: { PyStopAsyncIteration , PyStopIteration , PyValueError } ;
49use pyo3:: prelude:: * ;
510use pyo3:: types:: PyBytes ;
611use pyo3_object_store:: error:: { PyObjectStoreError , PyObjectStoreResult } ;
712use pyo3_object_store:: PyObjectStore ;
13+ use tokio:: sync:: Mutex ;
814
915use crate :: list:: PyObjectMeta ;
1016use crate :: runtime:: get_runtime;
1117
18+ /// 10MB default chunk size
19+ const DEFAULT_BYTES_CHUNK_SIZE : usize = 10 * 1024 * 1024 ;
20+
1221#[ derive( FromPyObject ) ]
1322pub ( crate ) struct PyGetOptions {
1423 if_match : Option < String > ,
@@ -54,7 +63,7 @@ impl PyGetResult {
5463 let runtime = get_runtime ( py) ?;
5564 py. allow_threads ( || {
5665 let bytes = runtime. block_on ( get_result. bytes ( ) ) ?;
57- Ok :: < _ , PyObjectStoreError > ( PyBytesWrapper ( bytes) )
66+ Ok :: < _ , PyObjectStoreError > ( PyBytesWrapper :: new ( bytes) )
5867 } )
5968 }
6069
@@ -68,7 +77,7 @@ impl PyGetResult {
6877 . bytes ( )
6978 . await
7079 . map_err ( PyObjectStoreError :: ObjectStoreError ) ?;
71- Ok ( PyBytesWrapper ( bytes) )
80+ Ok ( PyBytesWrapper :: new ( bytes) )
7281 } )
7382 }
7483
@@ -80,14 +89,129 @@ impl PyGetResult {
8089 . ok_or ( PyValueError :: new_err ( "Result has already been disposed." ) ) ?;
8190 Ok ( PyObjectMeta :: new ( inner. meta . clone ( ) ) )
8291 }
92+
93+ #[ pyo3( signature = ( min_chunk_size = DEFAULT_BYTES_CHUNK_SIZE ) ) ]
94+ fn stream ( & mut self , min_chunk_size : usize ) -> PyResult < PyBytesStream > {
95+ let get_result = self
96+ . 0
97+ . take ( )
98+ . ok_or ( PyValueError :: new_err ( "Result has already been disposed." ) ) ?;
99+ Ok ( PyBytesStream :: new ( get_result. into_stream ( ) , min_chunk_size) )
100+ }
101+
102+ fn __aiter__ ( & mut self ) -> PyResult < PyBytesStream > {
103+ self . stream ( DEFAULT_BYTES_CHUNK_SIZE )
104+ }
105+
106+ fn __iter__ ( & mut self ) -> PyResult < PyBytesStream > {
107+ self . stream ( DEFAULT_BYTES_CHUNK_SIZE )
108+ }
109+ }
110+
111+ #[ pyclass( name = "BytesStream" ) ]
112+ pub struct PyBytesStream {
113+ stream : Arc < Mutex < BoxStream < ' static , object_store:: Result < Bytes > > > > ,
114+ min_chunk_size : usize ,
115+ }
116+
117+ impl PyBytesStream {
118+ fn new ( stream : BoxStream < ' static , object_store:: Result < Bytes > > , min_chunk_size : usize ) -> Self {
119+ Self {
120+ stream : Arc :: new ( Mutex :: new ( stream) ) ,
121+ min_chunk_size,
122+ }
123+ }
124+ }
125+
126+ async fn next_stream (
127+ stream : Arc < Mutex < BoxStream < ' static , object_store:: Result < Bytes > > > > ,
128+ min_chunk_size : usize ,
129+ sync : bool ,
130+ ) -> PyResult < PyBytesWrapper > {
131+ let mut stream = stream. lock ( ) . await ;
132+ let mut buffers: Vec < Bytes > = vec ! [ ] ;
133+ loop {
134+ match stream. next ( ) . await {
135+ Some ( Ok ( bytes) ) => {
136+ buffers. push ( bytes) ;
137+ let total_buffer_len = buffers. iter ( ) . fold ( 0 , |acc, buf| acc + buf. len ( ) ) ;
138+ if total_buffer_len >= min_chunk_size {
139+ return Ok ( PyBytesWrapper :: new_multiple ( buffers) ) ;
140+ }
141+ }
142+ Some ( Err ( e) ) => return Err ( PyObjectStoreError :: from ( e) . into ( ) ) ,
143+ None => {
144+ if buffers. is_empty ( ) {
145+ // Depending on whether the iteration is sync or not, we raise either a
146+ // StopIteration or a StopAsyncIteration
147+ if sync {
148+ return Err ( PyStopIteration :: new_err ( "stream exhausted" ) ) ;
149+ } else {
150+ return Err ( PyStopAsyncIteration :: new_err ( "stream exhausted" ) ) ;
151+ }
152+ } else {
153+ return Ok ( PyBytesWrapper :: new_multiple ( buffers) ) ;
154+ }
155+ }
156+ } ;
157+ }
158+ }
159+
160+ #[ pymethods]
161+ impl PyBytesStream {
162+ fn __aiter__ ( slf : Py < Self > ) -> Py < Self > {
163+ slf
164+ }
165+
166+ fn __iter__ ( slf : Py < Self > ) -> Py < Self > {
167+ slf
168+ }
169+
170+ fn __anext__ < ' py > ( & ' py self , py : Python < ' py > ) -> PyResult < Bound < PyAny > > {
171+ let stream = self . stream . clone ( ) ;
172+ pyo3_async_runtimes:: tokio:: future_into_py (
173+ py,
174+ next_stream ( stream, self . min_chunk_size , false ) ,
175+ )
176+ }
177+
178+ fn __next__ < ' py > ( & ' py self , py : Python < ' py > ) -> PyResult < PyBytesWrapper > {
179+ let runtime = get_runtime ( py) ?;
180+ let stream = self . stream . clone ( ) ;
181+ runtime. block_on ( next_stream ( stream, self . min_chunk_size , true ) )
182+ }
83183}
84184
85- pub ( crate ) struct PyBytesWrapper ( bytes:: Bytes ) ;
185+ pub ( crate ) struct PyBytesWrapper ( Vec < Bytes > ) ;
186+
187+ impl PyBytesWrapper {
188+ pub fn new ( buf : Bytes ) -> Self {
189+ Self ( vec ! [ buf] )
190+ }
86191
87- // TODO: return buffer protocol object
192+ pub fn new_multiple ( buffers : Vec < Bytes > ) -> Self {
193+ Self ( buffers)
194+ }
195+ }
196+
197+ // TODO: return buffer protocol object? This isn't possible on an array of Bytes, so if you want to
198+ // support the buffer protocol in the future (e.g. for get_range) you may need to have a separate
199+ // wrapper of Bytes
88200impl IntoPy < PyObject > for PyBytesWrapper {
89201 fn into_py ( self , py : Python < ' _ > ) -> PyObject {
90- PyBytes :: new_bound ( py, & self . 0 ) . into_py ( py)
202+ let total_len = self . 0 . iter ( ) . fold ( 0 , |acc, buf| acc + buf. len ( ) ) ;
203+ // Copy all internal Bytes objects into a single PyBytes
204+ // Since our inner callback is infallible, this will only panic on out of memory
205+ PyBytes :: new_bound_with ( py, total_len, |target| {
206+ let mut offset = 0 ;
207+ for buf in self . 0 . iter ( ) {
208+ target[ offset..offset + buf. len ( ) ] . copy_from_slice ( buf) ;
209+ offset += buf. len ( ) ;
210+ }
211+ Ok ( ( ) )
212+ } )
213+ . unwrap ( )
214+ . into_py ( py)
91215 }
92216}
93217
@@ -144,7 +268,7 @@ pub(crate) fn get_range(
144268 let range = offset..offset + length;
145269 py. allow_threads ( || {
146270 let out = runtime. block_on ( store. as_ref ( ) . get_range ( & location. into ( ) , range) ) ?;
147- Ok :: < _ , PyObjectStoreError > ( PyBytesWrapper ( out) )
271+ Ok :: < _ , PyObjectStoreError > ( PyBytesWrapper :: new ( out) )
148272 } )
149273}
150274
@@ -163,7 +287,7 @@ pub(crate) fn get_range_async(
163287 . get_range ( & location. into ( ) , range)
164288 . await
165289 . map_err ( PyObjectStoreError :: ObjectStoreError ) ?;
166- Ok ( PyBytesWrapper ( out) )
290+ Ok ( PyBytesWrapper :: new ( out) )
167291 } )
168292}
169293
@@ -183,7 +307,7 @@ pub(crate) fn get_ranges(
183307 . collect :: < Vec < _ > > ( ) ;
184308 py. allow_threads ( || {
185309 let out = runtime. block_on ( store. as_ref ( ) . get_ranges ( & location. into ( ) , & ranges) ) ?;
186- Ok :: < _ , PyObjectStoreError > ( out. into_iter ( ) . map ( PyBytesWrapper ) . collect ( ) )
310+ Ok :: < _ , PyObjectStoreError > ( out. into_iter ( ) . map ( PyBytesWrapper :: new ) . collect ( ) )
187311 } )
188312}
189313
@@ -206,6 +330,6 @@ pub(crate) fn get_ranges_async(
206330 . get_ranges ( & location. into ( ) , & ranges)
207331 . await
208332 . map_err ( PyObjectStoreError :: ObjectStoreError ) ?;
209- Ok ( out. into_iter ( ) . map ( PyBytesWrapper ) . collect :: < Vec < _ > > ( ) )
333+ Ok ( out. into_iter ( ) . map ( PyBytesWrapper :: new ) . collect :: < Vec < _ > > ( ) )
210334 } )
211335}
0 commit comments