Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix missing unicode map entries #22

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/extract.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,5 @@ fn main() {
_ => panic!(),
};

output_doc(&doc, output.as_mut());
output_doc(&doc, output.as_mut()).unwrap();
}
116 changes: 76 additions & 40 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ impl<'a> PdfSimpleFont<'a> {
}
dlog!("{} = {} ({:?})", code, name, unicode);
if let Some(ref mut unicode_map) = unicode_map {
dlog!("{} {}", code, unicode_map[&(code as u32)]);
dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
}
code += 1;
}
Expand Down Expand Up @@ -630,7 +630,7 @@ impl<'a> PdfType3Font<'a> {
}
dlog!("{} = {} ({:?})", code, name, unicode);
if let Some(ref unicode_map) = unicode_map {
dlog!("{} {}", code, unicode_map.get(&(code as u32)));
dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
}
code += 1;
}
Expand Down Expand Up @@ -710,7 +710,7 @@ impl<'a> PdfFont for PdfSimpleFont<'a> {
return *width;
} else {
dlog!("missing width for {} falling back to default_width {:?}", id, self.font);
return self.default_width.unwrap();
return self.default_width.unwrap_or_default();
}
}
/*fn decode(&self, chars: &[u8]) -> String {
Expand All @@ -721,12 +721,18 @@ impl<'a> PdfFont for PdfSimpleFont<'a> {
fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
iter.next().map(|x| (*x as CharCode, 1))
}
fn decode_char(&self, char: CharCode) -> String {
let slice = [char as u8];
fn decode_char(&self, c: CharCode) -> String {
let slice = [c as u8];
if let Some(ref unicode_map) = self.unicode_map {
let s = unicode_map.get(&char);
let s = unicode_map.get(&c);
let s = match s {
None => { panic!("missing char {:?} in map {:?}", char, unicode_map)}
None => {
dlog!("missing char {:?} in map {:?}", c, unicode_map);
// if we don't have the char in the font's map, it may still
// be a "correct" unicode char, so return it. Otherwise return
// the UTF8 replacement char
[std::char::from_u32(c).unwrap_or_else(|| '\u{fffd}')].iter().collect::<String>()
}
Some(s) => { s.clone() }
};
return s
Expand All @@ -750,9 +756,10 @@ impl<'a> PdfFont for PdfType3Font<'a> {
fn get_width(&self, id: CharCode) -> f64 {
let width = self.widths.get(&id);
if let Some(width) = width {
return *width;
*width
} else {
panic!("missing width for {} {:?}", id, self.font);
dlog!("missing width for {} {:?}", id, self.font);
0.0
}
}
/*fn decode(&self, chars: &[u8]) -> String {
Expand All @@ -763,12 +770,18 @@ impl<'a> PdfFont for PdfType3Font<'a> {
fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
iter.next().map(|x| (*x as CharCode, 1))
}
fn decode_char(&self, char: CharCode) -> String {
let slice = [char as u8];
fn decode_char(&self, c: CharCode) -> String {
let slice = [c as u8];
if let Some(ref unicode_map) = self.unicode_map {
let s = unicode_map.get(&char);
let s = unicode_map.get(&c);
let s = match s {
None => { panic!("missing char {:?} in map {:?}", char, unicode_map)}
None => {
dlog!("missing char {:?} in map {:?}", c, unicode_map);
// if we don't have the char in the font's map, it may still
// be a "correct" unicode char, so return it. Otherwise return
// the UTF8 replacement char
[std::char::from_u32(c).unwrap_or_else(|| '\u{fffd}')].iter().collect::<String>()
},
Some(s) => { s.clone() }
};
return s
Expand Down Expand Up @@ -935,19 +948,28 @@ impl<'a> PdfFont for PdfCIDFont<'a> {
fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
let p = iter.next();
if let Some(&c) = p {
let next = *iter.next().unwrap();
Some((((c as u32) << 8) | next as u32, 2))
if let Some(&next) = iter.next() {
Some((((c as u32) << 8) | next as u32, 2))
} else {
Some((c as u32, 1))
}
} else {
None
}
}
fn decode_char(&self, char: CharCode) -> String {
let s = self.to_unicode.as_ref().and_then(|x| x.get(&char));
if let Some(s) = s {
s.clone()
fn decode_char(&self, c: CharCode) -> String {
if let Some(to_unicode) = self.to_unicode.as_ref() {
if let Some(s) = to_unicode.get(&c) {
s.clone()
} else {
dlog!("Unknown character {:?} in {:?} {:?}", c, self.font, self.to_unicode);
[(c >> 8) & 255, c & 255].iter().map(|c|
to_unicode.get(c).as_ref().map_or("", |s| &s[..])).collect()
}
} else {
dlog!("Unknown character {:?} in {:?} {:?}", char, self.font, self.to_unicode);
"".to_string()
dlog!("Unknown character {:?} in {:?} {:?}", c, self.font, self.to_unicode);
std::str::from_utf8(&[(c >> 16) as u8, (c >> 8) as u8, c as u8])
.unwrap_or("\u{fffd}").to_owned()
}
}
}
Expand Down Expand Up @@ -1272,6 +1294,17 @@ pub enum ColorSpace {
ICCBased(Vec<u8>)
}

macro_rules! colorspace_arg {
($ok:ident, $e:expr, $($format:expr),*) => {
if let $ok(v) = $e {
v
} else {
dlog!($($format),*);
return ColorSpace::DeviceRGB
}
};
}

fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary) -> ColorSpace {
match name {
b"DeviceGray" => ColorSpace::DeviceGray,
Expand All @@ -1280,12 +1313,12 @@ fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary
b"Pattern" => ColorSpace::Pattern,
_ => {
let colorspaces: &Dictionary = get(&doc, resources, b"ColorSpace");
let cs = maybe_get_array(doc, colorspaces, &name[..]).unwrap_or_else(|| panic!("missing colorspace {:?}", &name[..]));
let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"));
let cs = colorspace_arg!(Some, maybe_get_array(doc, colorspaces, &name[..]), "missing colorspace {:?}", &name[..]);
let cs_name = pdf_to_utf8(colorspace_arg!(Ok, cs[0].as_name(), "first arg must be a name"));
match cs_name.as_ref() {
"Separation" => {
let name = pdf_to_utf8(cs[1].as_name().expect("second arg must be a name"));
let alternate_space = pdf_to_utf8(cs[2].as_name().expect("second arg must be a name"));
let name = pdf_to_utf8(colorspace_arg!(Ok, cs[1].as_name(), "second arg must be a name"));
let alternate_space = pdf_to_utf8(colorspace_arg!(Ok, cs[2].as_name(), "second arg must be a name"));
let tint_transform = Box::new(Function::new(doc, maybe_deref(doc, &cs[3])));

dlog!("{:?} {:?} {:?}", name, alternate_space, tint_transform);
Expand All @@ -1298,15 +1331,15 @@ fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary
ColorSpace::ICCBased(get_contents(stream))
}
"CalGray" => {
let dict = cs[1].as_dict().expect("second arg must be a dict");
let dict = colorspace_arg!(Ok, cs[1].as_dict(), "second arg must be a dict");
ColorSpace::CalGray(CalGray {
white_point: get(&doc, dict, b"WhitePoint"),
black_point: get(&doc, dict, b"BackPoint"),
gamma: get(&doc, dict, b"Gamma"),
})
}
"CalRGB" => {
let dict = cs[1].as_dict().expect("second arg must be a dict");
let dict = colorspace_arg!(Ok, cs[1].as_dict(), "second arg must be a dict");
ColorSpace::CalRGB(CalRGB {
white_point: get(&doc, dict, b"WhitePoint"),
black_point: get(&doc, dict, b"BackPoint"),
Expand All @@ -1315,7 +1348,7 @@ fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary
})
}
"Lab" => {
let dict = cs[1].as_dict().expect("second arg must be a dict");
let dict = colorspace_arg!(Ok, cs[1].as_dict(), "second arg must be a dict");
ColorSpace::Lab(Lab {
white_point: get(&doc, dict, b"WhitePoint"),
black_point: get(&doc, dict, b"BackPoint"),
Expand All @@ -1326,7 +1359,8 @@ fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary
ColorSpace::Pattern
}
_ => {
panic!("color_space {:?} {:?} {:?}", name, cs_name, cs)
dlog!("color_space {:?} {:?} {:?}", name, cs_name, cs);
ColorSpace::DeviceRGB
}
}
}
Expand Down Expand Up @@ -1487,7 +1521,7 @@ impl<'a> Processor<'a> {
gs.ts.font = Some(font);

gs.ts.font_size = as_num(&operation.operands[1]);
dlog!("font {} size: {} {:?}", name, gs.ts.font_size, operation);
dlog!("font {:?} size: {} {:?}", name, gs.ts.font_size, operation);
}
"Ts" => {
gs.ts.rise = as_num(&operation.operands[0]);
Expand Down Expand Up @@ -1585,13 +1619,15 @@ impl<'a> Processor<'a> {
as_num(&operation.operands[3])))
}
"y" => {
path.ops.push(PathOp::CurveTo(
as_num(&operation.operands[0]),
as_num(&operation.operands[1]),
as_num(&operation.operands[2]),
as_num(&operation.operands[3]),
as_num(&operation.operands[2]),
as_num(&operation.operands[3])))
if operation.operands.len() > 3 {
path.ops.push(PathOp::CurveTo(
as_num(&operation.operands[0]),
as_num(&operation.operands[1]),
as_num(&operation.operands[2]),
as_num(&operation.operands[3]),
as_num(&operation.operands[2]),
as_num(&operation.operands[3])))
}
}
"h" => { path.ops.push(PathOp::Close) }
"re" => {
Expand Down Expand Up @@ -1923,7 +1959,7 @@ impl<W: ConvertToFmt> OutputDev for PlainTextOutput<W> {
fn end_page(&mut self) -> Result<(), OutputError> {
Ok(())
}
fn output_character(&mut self, trm: &Transform, width: f64, _spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError> {
fn output_character(&mut self, trm: &Transform, width: f64, _spacing: f64, font_size: f64, c: &str) -> Result<(), OutputError> {
let position = trm.post_transform(&self.flip_ctm);
let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
// get the length of one sized of the square with the same area with a rectangle of size (x, y)
Expand All @@ -1947,7 +1983,7 @@ impl<W: ConvertToFmt> OutputDev for PlainTextOutput<W> {
}
}
//let norm = unicode_normalization::UnicodeNormalization::nfkc(char);
write!(self.writer, "{}", char)?;
write!(self.writer, "{}", c)?;
self.first_char = false;
self.last_y = y;
self.last_end = x + width * transformed_font_size;
Expand All @@ -1970,7 +2006,7 @@ pub fn print_metadata(doc: &Document) {
if let Some(ref info) = get_info(&doc) {
for (k, v) in *info {
match v {
&Object::String(ref s, StringFormat::Literal) => { dlog!("{}: {}", k, pdf_to_utf8(s)); }
&Object::String(ref s, StringFormat::Literal) => { dlog!("{:?}: {}", k, pdf_to_utf8(s)); }
_ => {}
}
}
Expand Down