Skip to content

Commit

Permalink
fix missing unicode map entries
Browse files Browse the repository at this point in the history
  • Loading branch information
Andre Bogus committed Aug 12, 2020
1 parent d5c755f commit d3ed8ce
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 41 deletions.
2 changes: 1 addition & 1 deletion examples/extract.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,5 @@ fn main() {
_ => panic!(),
};

output_doc(&doc, output.as_mut());
output_doc(&doc, output.as_mut()).unwrap();
}
116 changes: 76 additions & 40 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ impl<'a> PdfSimpleFont<'a> {
}
dlog!("{} = {} ({:?})", code, name, unicode);
if let Some(ref mut unicode_map) = unicode_map {
dlog!("{} {}", code, unicode_map[&(code as u32)]);
dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
}
code += 1;
}
Expand Down Expand Up @@ -630,7 +630,7 @@ impl<'a> PdfType3Font<'a> {
}
dlog!("{} = {} ({:?})", code, name, unicode);
if let Some(ref unicode_map) = unicode_map {
dlog!("{} {}", code, unicode_map.get(&(code as u32)));
dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
}
code += 1;
}
Expand Down Expand Up @@ -710,7 +710,7 @@ impl<'a> PdfFont for PdfSimpleFont<'a> {
return *width;
} else {
dlog!("missing width for {} falling back to default_width {:?}", id, self.font);
return self.default_width.unwrap();
return self.default_width.unwrap_or_default();
}
}
/*fn decode(&self, chars: &[u8]) -> String {
Expand All @@ -721,12 +721,18 @@ impl<'a> PdfFont for PdfSimpleFont<'a> {
fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
iter.next().map(|x| (*x as CharCode, 1))
}
fn decode_char(&self, char: CharCode) -> String {
let slice = [char as u8];
fn decode_char(&self, c: CharCode) -> String {
let slice = [c as u8];
if let Some(ref unicode_map) = self.unicode_map {
let s = unicode_map.get(&char);
let s = unicode_map.get(&c);
let s = match s {
None => { panic!("missing char {:?} in map {:?}", char, unicode_map)}
None => {
dlog!("missing char {:?} in map {:?}", c, unicode_map);
// if we don't have the char in the font's map, it may still
// be a "correct" unicode char, so return it. Otherwise return
// the UTF8 replacement char
[std::char::from_u32(c).unwrap_or_else(|| '\u{fffd}')].iter().collect::<String>()
}
Some(s) => { s.clone() }
};
return s
Expand All @@ -750,9 +756,10 @@ impl<'a> PdfFont for PdfType3Font<'a> {
fn get_width(&self, id: CharCode) -> f64 {
let width = self.widths.get(&id);
if let Some(width) = width {
return *width;
*width
} else {
panic!("missing width for {} {:?}", id, self.font);
dlog!("missing width for {} {:?}", id, self.font);
0.0
}
}
/*fn decode(&self, chars: &[u8]) -> String {
Expand All @@ -763,12 +770,18 @@ impl<'a> PdfFont for PdfType3Font<'a> {
fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
iter.next().map(|x| (*x as CharCode, 1))
}
fn decode_char(&self, char: CharCode) -> String {
let slice = [char as u8];
fn decode_char(&self, c: CharCode) -> String {
let slice = [c as u8];
if let Some(ref unicode_map) = self.unicode_map {
let s = unicode_map.get(&char);
let s = unicode_map.get(&c);
let s = match s {
None => { panic!("missing char {:?} in map {:?}", char, unicode_map)}
None => {
dlog!("missing char {:?} in map {:?}", c, unicode_map);
// if we don't have the char in the font's map, it may still
// be a "correct" unicode char, so return it. Otherwise return
// the UTF8 replacement char
[std::char::from_u32(c).unwrap_or_else(|| '\u{fffd}')].iter().collect::<String>()
},
Some(s) => { s.clone() }
};
return s
Expand Down Expand Up @@ -935,19 +948,28 @@ impl<'a> PdfFont for PdfCIDFont<'a> {
fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
let p = iter.next();
if let Some(&c) = p {
let next = *iter.next().unwrap();
Some((((c as u32) << 8) | next as u32, 2))
if let Some(&next) = iter.next() {
Some((((c as u32) << 8) | next as u32, 2))
} else {
Some((c as u32, 1))
}
} else {
None
}
}
fn decode_char(&self, char: CharCode) -> String {
let s = self.to_unicode.as_ref().and_then(|x| x.get(&char));
if let Some(s) = s {
s.clone()
fn decode_char(&self, c: CharCode) -> String {
if let Some(to_unicode) = self.to_unicode.as_ref() {
if let Some(s) = to_unicode.get(&c) {
s.clone()
} else {
dlog!("Unknown character {:?} in {:?} {:?}", c, self.font, self.to_unicode);
[(c >> 8) & 255, c & 255].iter().map(|c|
to_unicode.get(c).as_ref().map_or("", |s| &s[..])).collect()
}
} else {
dlog!("Unknown character {:?} in {:?} {:?}", char, self.font, self.to_unicode);
"".to_string()
dlog!("Unknown character {:?} in {:?} {:?}", c, self.font, self.to_unicode);
std::str::from_utf8(&[(c >> 16) as u8, (c >> 8) as u8, c as u8])
.unwrap_or("\u{fffd}").to_owned()
}
}
}
Expand Down Expand Up @@ -1272,6 +1294,17 @@ pub enum ColorSpace {
ICCBased(Vec<u8>)
}

macro_rules! colorspace_arg {
($ok:ident, $e:expr, $($format:expr),*) => {
if let $ok(v) = $e {
v
} else {
dlog!($($format),*);
return ColorSpace::DeviceRGB
}
};
}

fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary) -> ColorSpace {
match name {
b"DeviceGray" => ColorSpace::DeviceGray,
Expand All @@ -1280,12 +1313,12 @@ fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary
b"Pattern" => ColorSpace::Pattern,
_ => {
let colorspaces: &Dictionary = get(&doc, resources, b"ColorSpace");
let cs = maybe_get_array(doc, colorspaces, &name[..]).unwrap_or_else(|| panic!("missing colorspace {:?}", &name[..]));
let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"));
let cs = colorspace_arg!(Some, maybe_get_array(doc, colorspaces, &name[..]), "missing colorspace {:?}", &name[..]);
let cs_name = pdf_to_utf8(colorspace_arg!(Ok, cs[0].as_name(), "first arg must be a name"));
match cs_name.as_ref() {
"Separation" => {
let name = pdf_to_utf8(cs[1].as_name().expect("second arg must be a name"));
let alternate_space = pdf_to_utf8(cs[2].as_name().expect("second arg must be a name"));
let name = pdf_to_utf8(colorspace_arg!(Ok, cs[1].as_name(), "second arg must be a name"));
let alternate_space = pdf_to_utf8(colorspace_arg!(Ok, cs[2].as_name(), "second arg must be a name"));
let tint_transform = Box::new(Function::new(doc, maybe_deref(doc, &cs[3])));

dlog!("{:?} {:?} {:?}", name, alternate_space, tint_transform);
Expand All @@ -1298,15 +1331,15 @@ fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary
ColorSpace::ICCBased(get_contents(stream))
}
"CalGray" => {
let dict = cs[1].as_dict().expect("second arg must be a dict");
let dict = colorspace_arg!(Ok, cs[1].as_dict(), "second arg must be a dict");
ColorSpace::CalGray(CalGray {
white_point: get(&doc, dict, b"WhitePoint"),
black_point: get(&doc, dict, b"BackPoint"),
gamma: get(&doc, dict, b"Gamma"),
})
}
"CalRGB" => {
let dict = cs[1].as_dict().expect("second arg must be a dict");
let dict = colorspace_arg!(Ok, cs[1].as_dict(), "second arg must be a dict");
ColorSpace::CalRGB(CalRGB {
white_point: get(&doc, dict, b"WhitePoint"),
black_point: get(&doc, dict, b"BackPoint"),
Expand All @@ -1315,7 +1348,7 @@ fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary
})
}
"Lab" => {
let dict = cs[1].as_dict().expect("second arg must be a dict");
let dict = colorspace_arg!(Ok, cs[1].as_dict(), "second arg must be a dict");
ColorSpace::Lab(Lab {
white_point: get(&doc, dict, b"WhitePoint"),
black_point: get(&doc, dict, b"BackPoint"),
Expand All @@ -1326,7 +1359,8 @@ fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary
ColorSpace::Pattern
}
_ => {
panic!("color_space {:?} {:?} {:?}", name, cs_name, cs)
dlog!("color_space {:?} {:?} {:?}", name, cs_name, cs);
ColorSpace::DeviceRGB
}
}
}
Expand Down Expand Up @@ -1487,7 +1521,7 @@ impl<'a> Processor<'a> {
gs.ts.font = Some(font);

gs.ts.font_size = as_num(&operation.operands[1]);
dlog!("font {} size: {} {:?}", name, gs.ts.font_size, operation);
dlog!("font {:?} size: {} {:?}", name, gs.ts.font_size, operation);
}
"Ts" => {
gs.ts.rise = as_num(&operation.operands[0]);
Expand Down Expand Up @@ -1585,13 +1619,15 @@ impl<'a> Processor<'a> {
as_num(&operation.operands[3])))
}
"y" => {
path.ops.push(PathOp::CurveTo(
as_num(&operation.operands[0]),
as_num(&operation.operands[1]),
as_num(&operation.operands[2]),
as_num(&operation.operands[3]),
as_num(&operation.operands[2]),
as_num(&operation.operands[3])))
if operation.operands.len() > 3 {
path.ops.push(PathOp::CurveTo(
as_num(&operation.operands[0]),
as_num(&operation.operands[1]),
as_num(&operation.operands[2]),
as_num(&operation.operands[3]),
as_num(&operation.operands[2]),
as_num(&operation.operands[3])))
}
}
"h" => { path.ops.push(PathOp::Close) }
"re" => {
Expand Down Expand Up @@ -1923,7 +1959,7 @@ impl<W: ConvertToFmt> OutputDev for PlainTextOutput<W> {
fn end_page(&mut self) -> Result<(), OutputError> {
Ok(())
}
fn output_character(&mut self, trm: &Transform, width: f64, _spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError> {
fn output_character(&mut self, trm: &Transform, width: f64, _spacing: f64, font_size: f64, c: &str) -> Result<(), OutputError> {
let position = trm.post_transform(&self.flip_ctm);
let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
// get the length of one sized of the square with the same area with a rectangle of size (x, y)
Expand All @@ -1947,7 +1983,7 @@ impl<W: ConvertToFmt> OutputDev for PlainTextOutput<W> {
}
}
//let norm = unicode_normalization::UnicodeNormalization::nfkc(char);
write!(self.writer, "{}", char)?;
write!(self.writer, "{}", c)?;
self.first_char = false;
self.last_y = y;
self.last_end = x + width * transformed_font_size;
Expand All @@ -1970,7 +2006,7 @@ pub fn print_metadata(doc: &Document) {
if let Some(ref info) = get_info(&doc) {
for (k, v) in *info {
match v {
&Object::String(ref s, StringFormat::Literal) => { dlog!("{}: {}", k, pdf_to_utf8(s)); }
&Object::String(ref s, StringFormat::Literal) => { dlog!("{:?}: {}", k, pdf_to_utf8(s)); }
_ => {}
}
}
Expand Down

0 comments on commit d3ed8ce

Please sign in to comment.