dicom_encoding/
text.rs

1//! This module contains reusable components for encoding and decoding text in DICOM
2//! data structures, including support for character repertoires.
3//!
//! The table below shows which character sets can currently be decoded and
//! encoded (✓ means supported, x means not supported yet):
5//!
6//! | Character Set                 | decoding support | encoding support |
7//! |-------------------------------|------------------|------------------|
8//! | ISO-IR 6 (default)            | ✓ | ✓ |
9//! | ISO-IR 100 (ISO-8859-1): Right-hand part of the Latin alphabet no. 1, the Western Europe character set | ✓ | ✓ |
10//! | ISO-IR 101 (ISO-8859-2): Right-hand part of the Latin alphabet no. 2, the Central/Eastern Europe character set | ✓ | ✓ |
11//! | ISO-IR 109 (ISO-8859-3): Right-hand part of the Latin alphabet no. 3, the South Europe character set | ✓ | ✓ |
12//! | ISO-IR 110 (ISO-8859-4): Right-hand part of the Latin alphabet no. 4, the North Europe character set | ✓ | ✓ |
13//! | ISO-IR 144 (ISO-8859-5): The Latin/Cyrillic character set | ✓ | ✓ |
14//! | ISO-IR 192: The Unicode character set based on the UTF-8 encoding | ✓ | ✓ |
15//! | GB18030: The Simplified Chinese character set | ✓ | ✓ |
16//! | JIS X 0201-1976: Code for Information Interchange | x | x |
17//! | JIS X 0208-1990: Code for the Japanese Graphic Character set for information interchange | x | x |
18//! | JIS X 0212-1990: Code of the supplementary Japanese Graphic Character set for information interchange | x | x |
19//! | KS X 1001 (registered as ISO-IR 149) for Korean Language | x | x |
20//! | TIS 620-2533 (1990) Thai Characters Code for Information Interchange | x | x |
21//! | GB2312: Simplified Chinese character set | x | x |
22//!
23//! These capabilities are available through [`SpecificCharacterSet`].
24
25use encoding::all::{GB18030, ISO_8859_1, ISO_8859_2, ISO_8859_3, ISO_8859_4, ISO_8859_5, UTF_8};
26use encoding::{DecoderTrap, EncoderTrap, Encoding, RawDecoder, StringWriter};
27use snafu::{Backtrace, Snafu};
28use std::borrow::Cow;
29use std::fmt::Debug;
30
31/// An error type for text encoding issues.
32#[derive(Debug, Snafu)]
33#[non_exhaustive]
34pub enum EncodeTextError {
35    /// A custom error message,
36    /// for when the underlying error type does not encode error semantics
37    /// into type variants.
38    #[snafu(display("{}", message))]
39    EncodeCustom {
40        /// The error message in plain text.
41        message: Cow<'static, str>,
42        /// The generated backtrace, if available.
43        backtrace: Backtrace,
44    },
45}
46
47/// An error type for text decoding issues.
48#[derive(Debug, Snafu)]
49#[non_exhaustive]
50pub enum DecodeTextError {
51    /// A custom error message,
52    /// for when the underlying error type does not encode error semantics
53    /// into type variants.
54    #[snafu(display("{}", message))]
55    DecodeCustom {
56        /// The error message in plain text.
57        message: Cow<'static, str>,
58        /// The generated backtrace, if available.
59        backtrace: Backtrace,
60    },
61}
62
63type EncodeResult<T> = Result<T, EncodeTextError>;
64type DecodeResult<T> = Result<T, DecodeTextError>;
65
66/// A holder of encoding and decoding mechanisms for text in DICOM content,
67/// which according to the standard, depends on the specific character set.
68pub trait TextCodec {
69    /// Obtain the defined term (unique name) of the text encoding,
70    /// which may be used as the value of a
71    /// Specific Character Set (0008, 0005) element to refer to this codec.
72    ///
73    /// Should contain no leading or trailing spaces.
74    /// This method may be useful for testing purposes, considering that
75    /// `TextCodec` is often used as a trait object.
76    fn name(&self) -> Cow<'static, str>;
77
78    /// Decode the given byte buffer as a single string. The resulting string
79    /// _may_ contain backslash characters ('\') to delimit individual values,
80    /// and should be split later on if required.
81    fn decode(&self, text: &[u8]) -> DecodeResult<String>;
82
83    /// Encode a text value into a byte vector. The input string can
84    /// feature multiple text values by using the backslash character ('\')
85    /// as the value delimiter.
86    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>>;
87}
88
89impl<T: ?Sized> TextCodec for Box<T>
90where
91    T: TextCodec,
92{
93    fn name(&self) -> Cow<'static, str> {
94        self.as_ref().name()
95    }
96
97    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
98        self.as_ref().decode(text)
99    }
100
101    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
102        self.as_ref().encode(text)
103    }
104}
105
106impl<'a, T: ?Sized> TextCodec for &'a T
107where
108    T: TextCodec,
109{
110    fn name(&self) -> Cow<'static, str> {
111        (**self).name()
112    }
113
114    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
115        (**self).decode(text)
116    }
117
118    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
119        (**self).encode(text)
120    }
121}
122
123/// A descriptor for a specific character set,
124/// taking part in text encoding and decoding
125/// as per [PS3.5 ch 6 6.1](https://dicom.nema.org/medical/dicom/2023e/output/chtml/part05/chapter_6.html#sect_6.1).
126///
127/// # Example
128///
129/// Use [`from_code`](SpecificCharacterSet::from_code)
130/// or one of the associated constants to create a character set.
131/// From there, use the [`TextCodec`] trait to encode and decode text.
132///
133/// ```
134/// use dicom_encoding::text::{SpecificCharacterSet, TextCodec};
135///
136/// let character_set = SpecificCharacterSet::from_code("ISO_IR 100").unwrap();
137/// assert_eq!(character_set, SpecificCharacterSet::ISO_IR_100);
138/// ```
139#[derive(Debug, Default, Clone, PartialEq)]
140pub struct SpecificCharacterSet(CharsetImpl);
141
142impl SpecificCharacterSet {
143    /// ISO IR 6: The default character set, as defined by the DICOM standard.
144    pub const ISO_IR_6: SpecificCharacterSet = SpecificCharacterSet(CharsetImpl::Default);
145
146    // ISO IR 100: ISO 8859-1, the Western Europe character set
147    pub const ISO_IR_100: SpecificCharacterSet = SpecificCharacterSet(CharsetImpl::IsoIr100);
148
149    /// ISO IR 192: UTF-8 encoding
150    pub const ISO_IR_192: SpecificCharacterSet = SpecificCharacterSet(CharsetImpl::IsoIr192);
151
152    /// Obtain the specific character set identified by the given code string.
153    ///
154    /// Supported code strings include the possible values
155    /// in the respective DICOM element (0008, 0005).
156    ///
157    /// # Example
158    ///
159    /// ```
160    /// use dicom_encoding::text::{SpecificCharacterSet, TextCodec};
161    ///
162    /// let character_set = SpecificCharacterSet::from_code("ISO_IR 100").unwrap();
163    /// assert_eq!(character_set.name(), "ISO_IR 100");
164    /// ```
165    pub fn from_code(code: &str) -> Option<Self> {
166        CharsetImpl::from_code(code).map(SpecificCharacterSet)
167    }
168}
169
170impl TextCodec for SpecificCharacterSet {
171    fn name(&self) -> Cow<'static, str> {
172        self.0.name()
173    }
174
175    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
176        self.0.decode(text)
177    }
178
179    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
180        self.0.encode(text)
181    }
182}
183
/// The individual character sets which are currently supported.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
#[non_exhaustive]
enum CharsetImpl {
    /// **ISO-IR 6**: the default character set.
    #[default]
    Default,
    /// **ISO-IR 100** (ISO-8859-1): the Western Europe character set
    /// (right-hand part of the Latin alphabet no. 1).
    IsoIr100,
    /// **ISO-IR 101** (ISO-8859-2): the Central/Eastern Europe character set
    /// (right-hand part of the Latin alphabet no. 2).
    IsoIr101,
    /// **ISO-IR 109** (ISO-8859-3): the South Europe character set
    /// (right-hand part of the Latin alphabet no. 3).
    IsoIr109,
    /// **ISO-IR 110** (ISO-8859-4): the North Europe character set
    /// (right-hand part of the Latin alphabet no. 4).
    IsoIr110,
    /// **ISO-IR 144** (ISO-8859-5): the Latin/Cyrillic character set.
    IsoIr144,
    /// **ISO-IR 192**: the Unicode character set based on the UTF-8 encoding.
    IsoIr192,
    /// **GB18030**: the Simplified Chinese character set.
    Gb18030,
    // Support for more text encodings is tracked in issue #40.
}
211
212impl CharsetImpl {
213    /// Obtain the specific character set identified by the given code string.
214    ///
215    /// Supported code strings include the possible values
216    /// in the respective DICOM element (0008, 0005).
217    pub fn from_code(uid: &str) -> Option<Self> {
218        use self::CharsetImpl::*;
219        match uid.trim_end() {
220            "Default" | "ISO_IR_6" | "ISO_IR 6" | "ISO 2022 IR 6" => Some(Default),
221            "ISO_IR_100" | "ISO_IR 100" | "ISO 2022 IR 100" => Some(IsoIr100),
222            "ISO_IR_101" | "ISO_IR 101" | "ISO 2022 IR 101" => Some(IsoIr101),
223            "ISO_IR_109" | "ISO_IR 109" | "ISO 2022 IR 109" => Some(IsoIr109),
224            "ISO_IR_110" | "ISO_IR 110" | "ISO 2022 IR 110" => Some(IsoIr110),
225            "ISO_IR_144" | "ISO_IR 144" | "ISO 2022 IR 144" => Some(IsoIr144),
226            "ISO_IR_192" | "ISO_IR 192" => Some(IsoIr192),
227            "GB18030" => Some(Gb18030),
228            _ => None,
229        }
230    }
231}
232
233impl TextCodec for CharsetImpl {
234    fn name(&self) -> Cow<'static, str> {
235        Cow::Borrowed(match self {
236            CharsetImpl::Default => "ISO_IR 6",
237            CharsetImpl::IsoIr100 => "ISO_IR 100",
238            CharsetImpl::IsoIr101 => "ISO_IR 101",
239            CharsetImpl::IsoIr109 => "ISO_IR 109",
240            CharsetImpl::IsoIr110 => "ISO_IR 110",
241            CharsetImpl::IsoIr144 => "ISO_IR 144",
242            CharsetImpl::IsoIr192 => "ISO_IR 192",
243            CharsetImpl::Gb18030 => "GB18030",
244        })
245    }
246
247    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
248        match self {
249            CharsetImpl::Default => DefaultCharacterSetCodec.decode(text),
250            CharsetImpl::IsoIr100 => IsoIr100CharacterSetCodec.decode(text),
251            CharsetImpl::IsoIr101 => IsoIr101CharacterSetCodec.decode(text),
252            CharsetImpl::IsoIr109 => IsoIr109CharacterSetCodec.decode(text),
253            CharsetImpl::IsoIr110 => IsoIr110CharacterSetCodec.decode(text),
254            CharsetImpl::IsoIr144 => IsoIr144CharacterSetCodec.decode(text),
255            CharsetImpl::IsoIr192 => Utf8CharacterSetCodec.decode(text),
256            CharsetImpl::Gb18030 => Gb18030CharacterSetCodec.decode(text),
257        }
258    }
259
260    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
261        match self {
262            CharsetImpl::Default => DefaultCharacterSetCodec.encode(text),
263            CharsetImpl::IsoIr100 => IsoIr100CharacterSetCodec.encode(text),
264            CharsetImpl::IsoIr101 => IsoIr101CharacterSetCodec.encode(text),
265            CharsetImpl::IsoIr109 => IsoIr109CharacterSetCodec.encode(text),
266            CharsetImpl::IsoIr110 => IsoIr110CharacterSetCodec.encode(text),
267            CharsetImpl::IsoIr144 => IsoIr144CharacterSetCodec.encode(text),
268            CharsetImpl::IsoIr192 => Utf8CharacterSetCodec.encode(text),
269            CharsetImpl::Gb18030 => Gb18030CharacterSetCodec.encode(text),
270        }
271    }
272}
273
274fn decode_text_trap(
275    _decoder: &mut dyn RawDecoder,
276    input: &[u8],
277    output: &mut dyn StringWriter,
278) -> bool {
279    let c = input[0];
280    let o0 = c & 7;
281    let o1 = (c & 56) >> 3;
282    let o2 = (c & 192) >> 6;
283    output.write_char('\\');
284    output.write_char((o2 + b'0') as char);
285    output.write_char((o1 + b'0') as char);
286    output.write_char((o0 + b'0') as char);
287    true
288}
289
/// Declare a unit struct named `$typ` and implement [`TextCodec`] for it,
/// where `$term` is the name reported by the codec
/// and `$val` is the `encoding` crate value doing the actual work.
macro_rules! decl_character_set {
    ($typ: ident, $term: literal, $val: expr) => {
        #[doc = concat!("Data type for the ", $term, " character set encoding.")]
        #[derive(Debug, Default, Copy, Clone, Eq, Hash, PartialEq)]
        pub struct $typ;

        impl TextCodec for $typ {
            fn name(&self) -> Cow<'static, str> {
                Cow::Borrowed($term)
            }

            fn decode(&self, text: &[u8]) -> DecodeResult<String> {
                // undecodable bytes are escaped by the trap instead of failing
                match $val.decode(text, DecoderTrap::Call(decode_text_trap)) {
                    Ok(decoded) => Ok(decoded),
                    Err(message) => Err(DecodeCustomSnafu { message }.build()),
                }
            }

            fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
                // characters outside of the character set are an error
                match $val.encode(text, EncoderTrap::Strict) {
                    Ok(encoded) => Ok(encoded),
                    Err(message) => Err(EncodeCustomSnafu { message }.build()),
                }
            }
        }
    };
}
316
317/// Data type representing the default character set.
318#[derive(Debug, Default, Copy, Clone, Eq, Hash, PartialEq)]
319pub struct DefaultCharacterSetCodec;
320
321impl TextCodec for DefaultCharacterSetCodec {
322    fn name(&self) -> Cow<'static, str> {
323        Cow::Borrowed("ISO_IR 6")
324    }
325
326    fn decode(&self, text: &[u8]) -> DecodeResult<String> {
327        // Using 8859-1 because it is a superset. Reiterations of this impl
328        // should check for invalid character codes (#40).
329        ISO_8859_1
330            .decode(text, DecoderTrap::Call(decode_text_trap))
331            .map_err(|message| DecodeCustomSnafu { message }.build())
332    }
333
334    fn encode(&self, text: &str) -> EncodeResult<Vec<u8>> {
335        ISO_8859_1
336            .encode(text, EncoderTrap::Strict)
337            .map_err(|message| EncodeCustomSnafu { message }.build())
338    }
339}
340
341decl_character_set!(IsoIr100CharacterSetCodec, "ISO_IR 100", ISO_8859_1);
342decl_character_set!(IsoIr101CharacterSetCodec, "ISO_IR 101", ISO_8859_2);
343decl_character_set!(IsoIr109CharacterSetCodec, "ISO_IR 109", ISO_8859_3);
344decl_character_set!(IsoIr110CharacterSetCodec, "ISO_IR 110", ISO_8859_4);
345decl_character_set!(IsoIr144CharacterSetCodec, "ISO_IR 144", ISO_8859_5);
346decl_character_set!(Utf8CharacterSetCodec, "ISO_IR 192", UTF_8);
347decl_character_set!(Gb18030CharacterSetCodec, "GB18030", GB18030);
348
/// Outcome of validating a piece of text (see [`validate_iso_8859`]).
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub enum TextValidationOutcome {
    /// Every character is valid: the text can be safely decoded.
    Ok,
    /// The text can be safely decoded,
    /// although some characters may have to be replaced.
    BadCharacters,
    /// Decoding the text is not possible.
    NotOk,
}
359
360/// Check whether the given byte slice contains valid text from the default character repertoire.
361pub fn validate_iso_8859(text: &[u8]) -> TextValidationOutcome {
362    if ISO_8859_1.decode(text, DecoderTrap::Strict).is_err() {
363        match ISO_8859_1.decode(text, DecoderTrap::Call(decode_text_trap)) {
364            Ok(_) => TextValidationOutcome::BadCharacters,
365            Err(_) => TextValidationOutcome::NotOk,
366        }
367    } else {
368        TextValidationOutcome::Ok
369    }
370}
371
372/// Check whether the given byte slice contains only valid characters for a
373/// Date value representation.
374pub fn validate_da(text: &[u8]) -> TextValidationOutcome {
375    if text.iter().cloned().all(|c| c.is_ascii_digit()) {
376        TextValidationOutcome::Ok
377    } else {
378        TextValidationOutcome::NotOk
379    }
380}
381
382/// Check whether the given byte slice contains only valid characters for a
383/// Time value representation.
384pub fn validate_tm(text: &[u8]) -> TextValidationOutcome {
385    if text.iter().cloned().all(|c| match c {
386        b'\\' | b'.' | b'-' | b' ' => true,
387        c => c.is_ascii_digit(),
388    }) {
389        TextValidationOutcome::Ok
390    } else {
391        TextValidationOutcome::NotOk
392    }
393}
394
395/// Check whether the given byte slice contains only valid characters for a
396/// Date Time value representation.
397pub fn validate_dt(text: &[u8]) -> TextValidationOutcome {
398    if text.iter().cloned().all(|c| match c {
399        b'.' | b'-' | b'+' | b' ' | b'\\' => true,
400        c => c.is_ascii_digit(),
401    }) {
402        TextValidationOutcome::Ok
403    } else {
404        TextValidationOutcome::NotOk
405    }
406}
407
408/// Check whether the given byte slice contains only valid characters for a
409/// Code String value representation.
410pub fn validate_cs(text: &[u8]) -> TextValidationOutcome {
411    if text.iter().cloned().all(|c| match c {
412        b' ' | b'_' => true,
413        c => c.is_ascii_digit() || c.is_ascii_uppercase(),
414    }) {
415        TextValidationOutcome::Ok
416    } else {
417        TextValidationOutcome::NotOk
418    }
419}
420
#[cfg(test)]
mod tests {
    use super::*;

    /// Check that `codec` encodes `string` into `bytes`
    /// and decodes `bytes` back into `string`.
    fn test_codec<T>(codec: T, string: &str, bytes: &[u8])
    where
        T: TextCodec,
    {
        let encoded = codec.encode(string).expect("encoding");
        assert_eq!(encoded, bytes);

        let decoded = codec.decode(bytes).expect("decoding");
        assert_eq!(decoded, string);
    }

    #[test]
    fn iso_ir_6_baseline() {
        // the default character set descriptor is ISO-IR 6
        test_codec(SpecificCharacterSet::default(), "Smith^John", b"Smith^John");
    }

    #[test]
    fn iso_ir_192_baseline() {
        let utf8 = SpecificCharacterSet::ISO_IR_192;
        for name in ["Simões^John", "Иванков^Андрей"] {
            // UTF-8 text is expected to be encoded as is
            test_codec(&utf8, name, name.as_bytes());
        }
    }

    #[test]
    fn iso_ir_100_baseline() {
        let latin1 = SpecificCharacterSet(CharsetImpl::IsoIr100);
        test_codec(&latin1, "Simões^João", b"Sim\xF5es^Jo\xE3o");
        test_codec(&latin1, "Günther^Hans", b"G\xfcnther^Hans");
    }

    #[test]
    fn iso_ir_101_baseline() {
        let latin2 = SpecificCharacterSet(CharsetImpl::IsoIr101);
        test_codec(latin2, "Günther^Hans", b"G\xfcnther^Hans");
    }

    #[test]
    fn iso_ir_144_baseline() {
        let cyrillic = SpecificCharacterSet(CharsetImpl::IsoIr144);
        let expected: &[u8] = b"\xb8\xd2\xd0\xdd\xda\xde\xd2^\xb0\xdd\xd4\xe0\xd5\xd9";
        test_codec(cyrillic, "Иванков^Андрей", expected);
    }
}
468}