pyo3/types/
string.rs

1#[cfg(not(Py_LIMITED_API))]
2use crate::exceptions::PyUnicodeDecodeError;
3use crate::ffi_ptr_ext::FfiPtrExt;
4use crate::instance::Borrowed;
5use crate::py_result_ext::PyResultExt;
6use crate::types::any::PyAnyMethods;
7use crate::types::bytes::PyBytesMethods;
8use crate::types::PyBytes;
9#[cfg(feature = "gil-refs")]
10use crate::PyNativeType;
11use crate::{ffi, Bound, IntoPy, Py, PyAny, PyResult, Python};
12use std::borrow::Cow;
13use std::str;
14
/// Borrowed view of the raw code units that back a Python `str`.
///
/// Python internally stores strings in one of several fixed-width representations;
/// each variant of this enumeration corresponds to one of them.
#[cfg(not(Py_LIMITED_API))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PyStringData<'a> {
    /// UCS1 representation: one byte per code unit.
    Ucs1(&'a [u8]),

    /// UCS2 representation: one `u16` per code unit.
    Ucs2(&'a [u16]),

    /// UCS4 representation: one `u32` per code unit.
    Ucs4(&'a [u32]),
}
31
32#[cfg(not(Py_LIMITED_API))]
33impl<'a> PyStringData<'a> {
34    /// Obtain the raw bytes backing this instance as a [u8] slice.
35    pub fn as_bytes(&self) -> &[u8] {
36        match self {
37            Self::Ucs1(s) => s,
38            Self::Ucs2(s) => unsafe {
39                std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes())
40            },
41            Self::Ucs4(s) => unsafe {
42                std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes())
43            },
44        }
45    }
46
47    /// Size in bytes of each value/item in the underlying slice.
48    #[inline]
49    pub fn value_width_bytes(&self) -> usize {
50        match self {
51            Self::Ucs1(_) => 1,
52            Self::Ucs2(_) => 2,
53            Self::Ucs4(_) => 4,
54        }
55    }
56
57    /// Convert the raw data to a Rust string.
58    ///
59    /// For UCS-1 / UTF-8, returns a borrow into the original slice. For UCS-2 and UCS-4,
60    /// returns an owned string.
61    ///
62    /// Returns [PyUnicodeDecodeError] if the string data isn't valid in its purported
63    /// storage format. This should only occur for strings that were created via Python
64    /// C APIs that skip input validation (like `PyUnicode_FromKindAndData`) and should
65    /// never occur for strings that were created from Python code.
66    pub fn to_string(self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
67        use std::ffi::CStr;
68        match self {
69            Self::Ucs1(data) => match str::from_utf8(data) {
70                Ok(s) => Ok(Cow::Borrowed(s)),
71                Err(e) => Err(PyUnicodeDecodeError::new_utf8_bound(py, data, e)?.into()),
72            },
73            Self::Ucs2(data) => match String::from_utf16(data) {
74                Ok(s) => Ok(Cow::Owned(s)),
75                Err(e) => {
76                    let mut message = e.to_string().as_bytes().to_vec();
77                    message.push(0);
78
79                    Err(PyUnicodeDecodeError::new_bound(
80                        py,
81                        ffi::c_str!("utf-16"),
82                        self.as_bytes(),
83                        0..self.as_bytes().len(),
84                        CStr::from_bytes_with_nul(&message).unwrap(),
85                    )?
86                    .into())
87                }
88            },
89            Self::Ucs4(data) => match data.iter().map(|&c| std::char::from_u32(c)).collect() {
90                Some(s) => Ok(Cow::Owned(s)),
91                None => Err(PyUnicodeDecodeError::new_bound(
92                    py,
93                    ffi::c_str!("utf-32"),
94                    self.as_bytes(),
95                    0..self.as_bytes().len(),
96                    ffi::c_str!("error converting utf-32"),
97                )?
98                .into()),
99            },
100        }
101    }
102
103    /// Convert the raw data to a Rust string, possibly with data loss.
104    ///
105    /// Invalid code points will be replaced with `U+FFFD REPLACEMENT CHARACTER`.
106    ///
107    /// Returns a borrow into original data, when possible, or owned data otherwise.
108    ///
109    /// The return value of this function should only disagree with [Self::to_string]
110    /// when that method would error.
111    pub fn to_string_lossy(self) -> Cow<'a, str> {
112        match self {
113            Self::Ucs1(data) => String::from_utf8_lossy(data),
114            Self::Ucs2(data) => Cow::Owned(String::from_utf16_lossy(data)),
115            Self::Ucs4(data) => Cow::Owned(
116                data.iter()
117                    .map(|&c| std::char::from_u32(c).unwrap_or('\u{FFFD}'))
118                    .collect(),
119            ),
120        }
121    }
122}
123
/// Represents a Python `str` (a Unicode string object).
///
/// Values of this type are accessed via PyO3's smart pointers, e.g. as
/// [`Py<PyString>`][crate::Py] or [`Bound<'py, PyString>`][Bound].
///
/// For APIs available on `str` objects, see the [`PyStringMethods`] trait which is implemented for
/// [`Bound<'py, PyString>`][Bound].
///
/// # Equality
///
/// For convenience, [`Bound<'py, PyString>`] implements [`PartialEq<str>`] to allow comparing the
/// data in the Python string to a Rust UTF-8 string slice.
///
/// This is not always the most appropriate way to compare Python strings, as Python string subclasses
/// may have different equality semantics. In situations where subclasses overriding equality might be
/// relevant, use [`PyAnyMethods::eq`], at the cost of the additional overhead of a Python method call.
///
/// ```rust
/// # use pyo3::prelude::*;
/// use pyo3::types::PyString;
///
/// # Python::with_gil(|py| {
/// let py_string = PyString::new_bound(py, "foo");
/// // via PartialEq<str>
/// assert_eq!(py_string, "foo");
///
/// // via Python equality
/// assert!(py_string.as_any().eq("foo").unwrap());
/// # });
/// ```
#[repr(transparent)]
pub struct PyString(PyAny);

// Associates `PyString` with the `PyUnicode_Type` type object and uses `PyUnicode_Check`
// as its type-check function. The macro is defined elsewhere in the crate; presumably it
// generates the core native-type trait impls — confirm against its definition.
pyobject_native_type_core!(PyString, pyobject_native_static_type_object!(ffi::PyUnicode_Type), #checkfunction=ffi::PyUnicode_Check);
158
159impl PyString {
160    /// Creates a new Python string object.
161    ///
162    /// Panics if out of memory.
163    pub fn new_bound<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
164        let ptr = s.as_ptr().cast();
165        let len = s.len() as ffi::Py_ssize_t;
166        unsafe {
167            ffi::PyUnicode_FromStringAndSize(ptr, len)
168                .assume_owned(py)
169                .downcast_into_unchecked()
170        }
171    }
172
173    /// Intern the given string
174    ///
175    /// This will return a reference to the same Python string object if called repeatedly with the same string.
176    ///
177    /// Note that while this is more memory efficient than [`PyString::new_bound`], it unconditionally allocates a
178    /// temporary Python string object and is thereby slower than [`PyString::new_bound`].
179    ///
180    /// Panics if out of memory.
181    pub fn intern_bound<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
182        let ptr = s.as_ptr().cast();
183        let len = s.len() as ffi::Py_ssize_t;
184        unsafe {
185            let mut ob = ffi::PyUnicode_FromStringAndSize(ptr, len);
186            if !ob.is_null() {
187                ffi::PyUnicode_InternInPlace(&mut ob);
188            }
189            ob.assume_owned(py).downcast_into_unchecked()
190        }
191    }
192
193    /// Attempts to create a Python string from a Python [bytes-like object].
194    ///
195    /// [bytes-like object]: (https://docs.python.org/3/glossary.html#term-bytes-like-object).
196    pub fn from_object_bound<'py>(
197        src: &Bound<'py, PyAny>,
198        encoding: &str,
199        errors: &str,
200    ) -> PyResult<Bound<'py, PyString>> {
201        unsafe {
202            ffi::PyUnicode_FromEncodedObject(
203                src.as_ptr(),
204                encoding.as_ptr().cast(),
205                errors.as_ptr().cast(),
206            )
207            .assume_owned_or_err(src.py())
208            .downcast_into_unchecked()
209        }
210    }
211}
212
#[cfg(feature = "gil-refs")]
impl PyString {
    /// Deprecated form of [`PyString::new_bound`].
    #[deprecated(
        since = "0.21.0",
        note = "`PyString::new` will be replaced by `PyString::new_bound` in a future PyO3 version"
    )]
    pub fn new<'py>(py: Python<'py>, s: &str) -> &'py Self {
        let bound = Self::new_bound(py, s);
        bound.into_gil_ref()
    }

    /// Deprecated form of [`PyString::intern_bound`].
    #[deprecated(
        since = "0.21.0",
        note = "`PyString::intern` will be replaced by `PyString::intern_bound` in a future PyO3 version"
    )]
    pub fn intern<'py>(py: Python<'py>, s: &str) -> &'py Self {
        let interned = Self::intern_bound(py, s);
        interned.into_gil_ref()
    }

    /// Deprecated form of [`PyString::from_object_bound`].
    #[deprecated(
        since = "0.21.0",
        note = "`PyString::from_object` will be replaced by `PyString::from_object_bound` in a future PyO3 version"
    )]
    pub fn from_object<'py>(src: &'py PyAny, encoding: &str, errors: &str) -> PyResult<&'py Self> {
        let decoded = Self::from_object_bound(&src.as_borrowed(), encoding, errors)?;
        Ok(decoded.into_gil_ref())
    }

    /// Gets the Python string as a Rust UTF-8 string slice.
    ///
    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
    /// (containing unpaired surrogates).
    pub fn to_str(&self) -> PyResult<&str> {
        #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
        {
            let borrowed = self.as_borrowed();
            borrowed.to_str()
        }

        #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
        {
            // Without `PyUnicode_AsUTF8AndSize`, encode into a `bytes` object and borrow
            // the text from that GIL-ref instead.
            let encoded = self.as_borrowed().encode_utf8()?.into_gil_ref();
            let text = unsafe { std::str::from_utf8_unchecked(encoded.as_bytes()) };
            Ok(text)
        }
    }

    /// Converts the `PyString` into a Rust string, avoiding copying when possible.
    ///
    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
    /// (containing unpaired surrogates).
    pub fn to_cow(&self) -> PyResult<Cow<'_, str>> {
        let borrowed = self.as_borrowed();
        borrowed.to_cow()
    }

    /// Converts the `PyString` into a Rust string.
    ///
    /// Unpaired surrogates (which are invalid in UTF-8) are
    /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
    pub fn to_string_lossy(&self) -> Cow<'_, str> {
        let borrowed = self.as_borrowed();
        borrowed.to_string_lossy()
    }

    /// Obtains the raw data backing the Python string.
    ///
    /// If the Python string object was created through legacy APIs, its internal storage format
    /// will be canonicalized before data is returned.
    ///
    /// # Safety
    ///
    /// This function implementation relies on manually decoding a C bitfield. In practice, this
    /// works well on common little-endian architectures such as x86_64, where the bitfield has a
    /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on
    /// x86_64 platforms.
    ///
    /// By using this API, you accept responsibility for testing that PyStringData behaves as
    /// expected on the targets where you plan to distribute your software.
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    pub unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
        let borrowed = self.as_borrowed();
        borrowed.data()
    }
}
294
/// Implementation of functionality for [`PyString`].
///
/// These methods are defined for the `Bound<'py, PyString>` smart pointer, so to use method call
/// syntax these methods are separated into a trait, because stable Rust does not yet support
/// `arbitrary_self_types`.
///
/// The trait is sealed (it requires `crate::sealed::Sealed`), so it cannot be implemented
/// outside this crate.
#[doc(alias = "PyString")]
pub trait PyStringMethods<'py>: crate::sealed::Sealed {
    /// Gets the Python string as a Rust UTF-8 string slice.
    ///
    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
    /// (containing unpaired surrogates).
    ///
    /// Only available when building for Python 3.10+ or without the limited API.
    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
    fn to_str(&self) -> PyResult<&str>;

    /// Converts the `PyString` into a Rust string, avoiding copying when possible.
    ///
    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
    /// (containing unpaired surrogates).
    fn to_cow(&self) -> PyResult<Cow<'_, str>>;

    /// Converts the `PyString` into a Rust string.
    ///
    /// Unpaired surrogates (which are invalid in UTF-8) are
    /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
    fn to_string_lossy(&self) -> Cow<'_, str>;

    /// Encodes this string as a Python `bytes` object, using UTF-8 encoding.
    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>>;

    /// Obtains the raw data backing the Python string.
    ///
    /// If the Python string object was created through legacy APIs, its internal storage format
    /// will be canonicalized before data is returned.
    ///
    /// # Safety
    ///
    /// This function implementation relies on manually decoding a C bitfield. In practice, this
    /// works well on common little-endian architectures such as x86_64, where the bitfield has a
    /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on
    /// x86_64 platforms.
    ///
    /// By using this API, you accept responsibility for testing that PyStringData behaves as
    /// expected on the targets where you plan to distribute your software.
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    unsafe fn data(&self) -> PyResult<PyStringData<'_>>;
}
341
342impl<'py> PyStringMethods<'py> for Bound<'py, PyString> {
343    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
344    fn to_str(&self) -> PyResult<&str> {
345        self.as_borrowed().to_str()
346    }
347
348    fn to_cow(&self) -> PyResult<Cow<'_, str>> {
349        self.as_borrowed().to_cow()
350    }
351
352    fn to_string_lossy(&self) -> Cow<'_, str> {
353        self.as_borrowed().to_string_lossy()
354    }
355
356    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>> {
357        unsafe {
358            ffi::PyUnicode_AsUTF8String(self.as_ptr())
359                .assume_owned_or_err(self.py())
360                .downcast_into_unchecked::<PyBytes>()
361        }
362    }
363
364    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
365    unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
366        self.as_borrowed().data()
367    }
368}
369
370impl<'a> Borrowed<'a, '_, PyString> {
371    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
372    #[allow(clippy::wrong_self_convention)]
373    pub(crate) fn to_str(self) -> PyResult<&'a str> {
374        // PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10.
375        let mut size: ffi::Py_ssize_t = 0;
376        let data: *const u8 =
377            unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() };
378        if data.is_null() {
379            Err(crate::PyErr::fetch(self.py()))
380        } else {
381            Ok(unsafe {
382                std::str::from_utf8_unchecked(std::slice::from_raw_parts(data, size as usize))
383            })
384        }
385    }
386
387    #[allow(clippy::wrong_self_convention)]
388    pub(crate) fn to_cow(self) -> PyResult<Cow<'a, str>> {
389        // TODO: this method can probably be deprecated once Python 3.9 support is dropped,
390        // because all versions then support the more efficient `to_str`.
391        #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
392        {
393            self.to_str().map(Cow::Borrowed)
394        }
395
396        #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
397        {
398            let bytes = self.encode_utf8()?;
399            Ok(Cow::Owned(
400                unsafe { str::from_utf8_unchecked(bytes.as_bytes()) }.to_owned(),
401            ))
402        }
403    }
404
405    #[allow(clippy::wrong_self_convention)]
406    fn to_string_lossy(self) -> Cow<'a, str> {
407        let ptr = self.as_ptr();
408        let py = self.py();
409
410        #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
411        if let Ok(s) = self.to_str() {
412            return Cow::Borrowed(s);
413        }
414
415        let bytes = unsafe {
416            ffi::PyUnicode_AsEncodedString(
417                ptr,
418                ffi::c_str!("utf-8").as_ptr(),
419                ffi::c_str!("surrogatepass").as_ptr(),
420            )
421            .assume_owned(py)
422            .downcast_into_unchecked::<PyBytes>()
423        };
424        Cow::Owned(String::from_utf8_lossy(bytes.as_bytes()).into_owned())
425    }
426
427    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
428    unsafe fn data(self) -> PyResult<PyStringData<'a>> {
429        let ptr = self.as_ptr();
430
431        #[cfg(not(Py_3_12))]
432        #[allow(deprecated)]
433        {
434            let ready = ffi::PyUnicode_READY(ptr);
435            if ready != 0 {
436                // Exception was created on failure.
437                return Err(crate::PyErr::fetch(self.py()));
438            }
439        }
440
441        // The string should be in its canonical form after calling `PyUnicode_READY()`.
442        // And non-canonical form not possible after Python 3.12. So it should be safe
443        // to call these APIs.
444        let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
445        let raw_data = ffi::PyUnicode_DATA(ptr);
446        let kind = ffi::PyUnicode_KIND(ptr);
447
448        match kind {
449            ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts(
450                raw_data as *const u8,
451                length,
452            ))),
453            ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts(
454                raw_data as *const u16,
455                length,
456            ))),
457            ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts(
458                raw_data as *const u32,
459                length,
460            ))),
461            _ => unreachable!(),
462        }
463    }
464}
465
466impl Py<PyString> {
467    /// Gets the Python string as a Rust UTF-8 string slice.
468    ///
469    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
470    /// (containing unpaired surrogates).
471    ///
472    /// Because `str` objects are immutable, the returned slice is independent of
473    /// the GIL lifetime.
474    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
475    pub fn to_str<'a>(&'a self, py: Python<'_>) -> PyResult<&'a str> {
476        self.bind_borrowed(py).to_str()
477    }
478
479    /// Converts the `PyString` into a Rust string, avoiding copying when possible.
480    ///
481    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
482    /// (containing unpaired surrogates).
483    ///
484    /// Because `str` objects are immutable, the returned slice is independent of
485    /// the GIL lifetime.
486    pub fn to_cow<'a>(&'a self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
487        self.bind_borrowed(py).to_cow()
488    }
489
490    /// Converts the `PyString` into a Rust string.
491    ///
492    /// Unpaired surrogates invalid UTF-8 sequences are
493    /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
494    ///
495    /// Because `str` objects are immutable, the returned slice is independent of
496    /// the GIL lifetime.
497    pub fn to_string_lossy<'a>(&'a self, py: Python<'_>) -> Cow<'a, str> {
498        self.bind_borrowed(py).to_string_lossy()
499    }
500}
501
502impl IntoPy<Py<PyString>> for Bound<'_, PyString> {
503    fn into_py(self, _py: Python<'_>) -> Py<PyString> {
504        self.unbind()
505    }
506}
507
508impl IntoPy<Py<PyString>> for &Bound<'_, PyString> {
509    fn into_py(self, _py: Python<'_>) -> Py<PyString> {
510        self.clone().unbind()
511    }
512}
513
514impl IntoPy<Py<PyString>> for &'_ Py<PyString> {
515    fn into_py(self, py: Python<'_>) -> Py<PyString> {
516        self.clone_ref(py)
517    }
518}
519
520/// Compares whether the data in the Python string is equal to the given UTF8.
521///
522/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
523impl PartialEq<str> for Bound<'_, PyString> {
524    #[inline]
525    fn eq(&self, other: &str) -> bool {
526        self.as_borrowed() == *other
527    }
528}
529
530/// Compares whether the data in the Python string is equal to the given UTF8.
531///
532/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
533impl PartialEq<&'_ str> for Bound<'_, PyString> {
534    #[inline]
535    fn eq(&self, other: &&str) -> bool {
536        self.as_borrowed() == **other
537    }
538}
539
540/// Compares whether the data in the Python string is equal to the given UTF8.
541///
542/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
543impl PartialEq<Bound<'_, PyString>> for str {
544    #[inline]
545    fn eq(&self, other: &Bound<'_, PyString>) -> bool {
546        *self == other.as_borrowed()
547    }
548}
549
550/// Compares whether the data in the Python string is equal to the given UTF8.
551///
552/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
553impl PartialEq<&'_ Bound<'_, PyString>> for str {
554    #[inline]
555    fn eq(&self, other: &&Bound<'_, PyString>) -> bool {
556        *self == other.as_borrowed()
557    }
558}
559
560/// Compares whether the data in the Python string is equal to the given UTF8.
561///
562/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
563impl PartialEq<Bound<'_, PyString>> for &'_ str {
564    #[inline]
565    fn eq(&self, other: &Bound<'_, PyString>) -> bool {
566        **self == other.as_borrowed()
567    }
568}
569
570/// Compares whether the data in the Python string is equal to the given UTF8.
571///
572/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
573impl PartialEq<str> for &'_ Bound<'_, PyString> {
574    #[inline]
575    fn eq(&self, other: &str) -> bool {
576        self.as_borrowed() == other
577    }
578}
579
580/// Compares whether the data in the Python string is equal to the given UTF8.
581///
582/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
583impl PartialEq<str> for Borrowed<'_, '_, PyString> {
584    #[inline]
585    fn eq(&self, other: &str) -> bool {
586        #[cfg(not(Py_3_13))]
587        {
588            self.to_cow().map_or(false, |s| s == other)
589        }
590
591        #[cfg(Py_3_13)]
592        unsafe {
593            ffi::PyUnicode_EqualToUTF8AndSize(
594                self.as_ptr(),
595                other.as_ptr().cast(),
596                other.len() as _,
597            ) == 1
598        }
599    }
600}
601
602/// Compares whether the data in the Python string is equal to the given UTF8.
603///
604/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
605impl PartialEq<&str> for Borrowed<'_, '_, PyString> {
606    #[inline]
607    fn eq(&self, other: &&str) -> bool {
608        *self == **other
609    }
610}
611
612/// Compares whether the data in the Python string is equal to the given UTF8.
613///
614/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
615impl PartialEq<Borrowed<'_, '_, PyString>> for str {
616    #[inline]
617    fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
618        other == self
619    }
620}
621
622/// Compares whether the data in the Python string is equal to the given UTF8.
623///
624/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
625impl PartialEq<Borrowed<'_, '_, PyString>> for &'_ str {
626    #[inline]
627    fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
628        other == self
629    }
630}
631
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{PyObject, ToPyObject};

    // Tests that call `data()` are gated on the same cfg as `data()` itself
    // (`not(any(Py_LIMITED_API, GraalPy, PyPy))`), otherwise they fail to compile on
    // targets where the method does not exist.

    #[test]
    fn test_to_cow_utf8() {
        Python::with_gil(|py| {
            let s = "ascii 🐈";
            let py_string = PyString::new_bound(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    #[test]
    fn test_to_cow_surrogate() {
        Python::with_gil(|py| {
            let py_string = py
                .eval_bound(r"'\ud800'", None, None)
                .unwrap()
                .downcast_into::<PyString>()
                .unwrap();
            assert!(py_string.to_cow().is_err());
        })
    }

    #[test]
    fn test_to_cow_unicode() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new_bound(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    #[test]
    fn test_encode_utf8_unicode() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let obj = PyString::new_bound(py, s);
            assert_eq!(s.as_bytes(), obj.encode_utf8().unwrap().as_bytes());
        })
    }

    #[test]
    fn test_encode_utf8_surrogate() {
        Python::with_gil(|py| {
            let obj: PyObject = py.eval_bound(r"'\ud800'", None, None).unwrap().into();
            assert!(obj
                .bind(py)
                .downcast::<PyString>()
                .unwrap()
                .encode_utf8()
                .is_err());
        })
    }

    #[test]
    fn test_to_string_lossy() {
        Python::with_gil(|py| {
            let py_string = py
                .eval_bound(r"'🐈 Hello \ud800World'", None, None)
                .unwrap()
                .downcast_into::<PyString>()
                .unwrap();

            // The lone surrogate encodes to three bytes under `surrogatepass`, each of
            // which is replaced separately.
            assert_eq!(py_string.to_string_lossy(), "🐈 Hello ���World");
        })
    }

    #[test]
    fn test_debug_string() {
        Python::with_gil(|py| {
            let v = "Hello\n".to_object(py);
            let s = v.downcast_bound::<PyString>(py).unwrap();
            assert_eq!(format!("{:?}", s), "'Hello\\n'");
        })
    }

    #[test]
    fn test_display_string() {
        Python::with_gil(|py| {
            let v = "Hello\n".to_object(py);
            let s = v.downcast_bound::<PyString>(py).unwrap();
            assert_eq!(format!("{}", s), "Hello\n");
        })
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs1() {
        Python::with_gil(|py| {
            let s = PyString::new_bound(py, "hello, world");
            let data = unsafe { s.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
            assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("hello, world"));
        })
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs1_invalid() {
        Python::with_gil(|py| {
            // 0xfe is not allowed in UTF-8.
            let buffer = b"f\xfe\0";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_1BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs1(b"f\xfe"));
            let err = data.to_string(py).unwrap_err();
            assert!(err
                .get_type_bound(py)
                .is(&py.get_type_bound::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-8' codec can't decode byte 0xfe in position 1"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("f�"));
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs2() {
        Python::with_gil(|py| {
            let s = py.eval_bound("'foo\\ud800'", None, None).unwrap();
            let py_string = s.downcast::<PyString>().unwrap();
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("foo�".to_string())
            );
        })
    }

    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, GraalPy, PyPy)), target_endian = "little"))]
    fn test_string_data_ucs2_invalid() {
        Python::with_gil(|py| {
            // U+FF22 (valid) & U+d800 (never valid)
            let buffer = b"\x22\xff\x00\xd8\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_2BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err
                .get_type_bound(py)
                .is(&py.get_type_bound::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-16' codec can't decode bytes in position 0-3"));
            // Written with escapes so the expected text is unambiguous: U+FF22 is the
            // fullwidth letter, not ASCII `B`, followed by the replacement character.
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{ff22}\u{fffd}".into())
            );
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs4() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new_bound(py, s);
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>(s.to_string()));
        })
    }

    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, GraalPy, PyPy)), target_endian = "little"))]
    fn test_string_data_ucs4_invalid() {
        Python::with_gil(|py| {
            // U+20000 (valid) & U+d800 (never valid)
            let buffer = b"\x00\x00\x02\x00\x00\xd8\x00\x00\x00\x00\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_4BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err
                .get_type_bound(py)
                .is(&py.get_type_bound::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-32' codec can't decode bytes in position 0-7"));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>("𠀀�".into()));
        });
    }

    #[test]
    fn test_intern_string() {
        Python::with_gil(|py| {
            let py_string1 = PyString::intern_bound(py, "foo");
            assert_eq!(py_string1, "foo");

            let py_string2 = PyString::intern_bound(py, "foo");
            assert_eq!(py_string2, "foo");

            // Interning the same text twice must yield the same object.
            assert_eq!(py_string1.as_ptr(), py_string2.as_ptr());

            let py_string3 = PyString::intern_bound(py, "bar");
            assert_eq!(py_string3, "bar");

            assert_ne!(py_string1.as_ptr(), py_string3.as_ptr());
        });
    }

    #[test]
    fn test_py_to_str_utf8() {
        Python::with_gil(|py| {
            let s = "ascii 🐈";
            let py_string: Py<PyString> = PyString::new_bound(py, s).into_py(py);

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert_eq!(s, py_string.to_str(py).unwrap());

            assert_eq!(s, py_string.to_cow(py).unwrap());
        })
    }

    #[test]
    fn test_py_to_str_surrogate() {
        Python::with_gil(|py| {
            let py_string: Py<PyString> = py
                .eval_bound(r"'\ud800'", None, None)
                .unwrap()
                .extract()
                .unwrap();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert!(py_string.to_str(py).is_err());

            assert!(py_string.to_cow(py).is_err());
        })
    }

    #[test]
    fn test_py_to_string_lossy() {
        Python::with_gil(|py| {
            let py_string: Py<PyString> = py
                .eval_bound(r"'🐈 Hello \ud800World'", None, None)
                .unwrap()
                .extract()
                .unwrap();
            assert_eq!(py_string.to_string_lossy(py), "🐈 Hello ���World");
        })
    }

    #[test]
    fn test_comparisons() {
        Python::with_gil(|py| {
            let s = "hello, world";
            let py_string = PyString::new_bound(py, s);

            assert_eq!(py_string, "hello, world");

            // `Bound` against `&str`, in both operand orders.
            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            // `Bound` against `str`, in both operand orders.
            assert_eq!(py_string, *s);
            assert_eq!(&py_string, *s);
            assert_eq!(*s, py_string);
            assert_eq!(*s, &py_string);

            let py_string = py_string.as_borrowed();

            // The same again for `Borrowed`.
            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            assert_eq!(py_string, *s);
            assert_eq!(*s, py_string);
        })
    }
}