Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Read specific columns as str #52

Merged
merged 3 commits into from
Feb 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 42 additions & 10 deletions src/starfile/functions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Dict, List, Union
from typing import TYPE_CHECKING, Dict, List, Union, Optional

if TYPE_CHECKING:
import pandas as pd
Expand All @@ -15,15 +15,32 @@
from os import PathLike


def read(filename: PathLike, read_n_blocks: int = None, always_dict: bool = False):
"""
Read a star file into a pandas dataframe or dict of pandas dataframes
def read(
filename: PathLike,
read_n_blocks: Optional[int] = None,
always_dict: bool = False,
parse_as_string: List[str] = []
) -> Union[DataBlock, Dict[DataBlock]]:
"""Read data from a STAR file.

default behaviour in the case of only one data block being present in the STAR file is to
return only a dataframe, this can be changed by setting 'always_dict=True'
"""
Basic data blocks are read as dictionaries. Loop blocks are read as pandas
dataframes. When multiple data blocks are present a dictionary of datablocks is
returned. When a single datablock is present only the block is returned by default.
To force returning a dictionary even when only one datablock is present set
`always_dict=True`.

parser = StarParser(filename, n_blocks_to_read=read_n_blocks)
Parameters
----------
filename: PathLike
File from which to read data.
read_n_blocks: int | None
Limit reading the file to the first n data blocks.
always_dict: bool
Always return a dictionary, even when only a single data block is present.
parse_as_string: list[str]
A list of keys or column names which will not be coerced to numeric values.
"""
parser = StarParser(filename, n_blocks_to_read=read_n_blocks, parse_as_string=parse_as_string)
if len(parser.data_blocks) == 1 and always_dict is False:
return list(parser.data_blocks.values())[0]
else:
Expand All @@ -38,9 +55,24 @@ def write(
na_rep: str = '<NA>',
quote_character: str = '"',
quote_all_strings: bool = False,
**kwargs,
**kwargs
):
"""Write data blocks as STAR files."""
"""Write data to disk in the STAR format.

Parameters
----------
data: DataBlock | Dict[str, DataBlock] | List[DataBlock]
Data to be saved to file. DataBlocks are dictionaries or dataframes.
If a dictionary of datablocks are passed the keys will be the data block names.
filename: PathLike
Path where the file will be saved.
float_format: str
Float format string which will be passed to pandas.
sep: str
Separator between values, will be passed to pandas.
na_rep: str
Representation of null values, will be passed to pandas.
"""
StarWriter(
data,
filename=filename,
Expand Down
52 changes: 39 additions & 13 deletions src/starfile/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,28 @@
import numpy as np
import pandas as pd
from pathlib import Path
from typing import TYPE_CHECKING, Union, Optional, Dict, Tuple
from typing import TYPE_CHECKING, Union, Optional, Dict, Tuple, List

from starfile.typing import DataBlock

if TYPE_CHECKING:
from os import PathLike

def _apply_numeric(col: pd.Series) -> pd.Series:
try:
return pd.to_numeric(col)
except ValueError:
return col

class StarParser:
filename: Path
n_lines_in_file: int
n_blocks_to_read: int
current_line_number: int
data_blocks: Dict[DataBlock]

def __init__(self, filename: PathLike, n_blocks_to_read: Optional[int] = None):
parse_as_string: List[str]

def __init__(
self,
filename: PathLike,
n_blocks_to_read: Optional[int] = None,
parse_as_string: List[str] = [],
):
# set filename, with path checking
filename = Path(filename)
if not filename.exists():
Expand All @@ -39,6 +40,7 @@ def __init__(self, filename: PathLike, n_blocks_to_read: Optional[int] = None):
self.data_blocks = {}
self.n_lines_in_file = count_lines(self.filename)
self.n_blocks_to_read = n_blocks_to_read
self.parse_as_string = parse_as_string

# parse file
self.current_line_number = 0
Expand Down Expand Up @@ -78,7 +80,15 @@ def _parse_simple_block(self) -> Dict[str, Union[str, int, float]]:
break
elif self.current_line.startswith('_'): # '_foo bar'
k, v = shlex.split(self.current_line)
block[k[1:]] = numericise(v)
column_name = k[1:]
parse_column_as_string = (
self.parse_as_string is not None
and any(column_name == col for col in self.parse_as_string)
)
if parse_column_as_string is True:
block[column_name] = v
else:
block[column_name] = numericise(v)
self.current_line_number += 1
return block

Expand Down Expand Up @@ -108,18 +118,27 @@ def _parse_loop_block(self) -> pd.DataFrame:
n_cols = len(loop_column_names)
df = pd.DataFrame(np.zeros(shape=(0, n_cols)))
else:
column_name_to_index = {col: idx for idx, col in enumerate(loop_column_names)}
df = pd.read_csv(
StringIO(loop_data.replace("'", '"')),
delimiter=r'\s+',
header=None,
comment='#',
keep_default_na=False
dtype={column_name_to_index[k]: str for k in self.parse_as_string if k in loop_column_names},
keep_default_na=False,
engine='c',
)
df.columns = loop_column_names

# Numericise all columns in temporary copy
df_numeric = df.apply(_apply_numeric)
# Replace columns that are all NaN with the original string columns

# Replace columns that are all NaN with the original columns
df_numeric[df_numeric.columns[df_numeric.isna().all()]] = df[df_numeric.columns[df_numeric.isna().all()]]
df = df_numeric
df.columns = loop_column_names

# Replace columns that should be strings
for col in df.columns:
df[col] = df_numeric[col] if col not in self.parse_as_string else df[col]
return df


Expand Down Expand Up @@ -150,3 +169,10 @@ def numericise(value: str) -> Union[str, int, float]:
# If it's not a float either, leave it as a string
value = value
return value


def _apply_numeric(col: pd.Series) -> pd.Series:
try:
return pd.to_numeric(col)
except ValueError:
return col
43 changes: 28 additions & 15 deletions tests/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,34 +243,47 @@ def test_empty_loop_block():
assert len(parser.data_blocks) == 1



@pytest.mark.parametrize("quote_character, filename", [("'",basic_single_quote),
('"',basic_double_quote),
])
def test_quote_basic(quote_character,filename):
@pytest.mark.parametrize("quote_character, filename", [("'", basic_single_quote),
('"', basic_double_quote),
])
def test_quote_basic(quote_character, filename):
parser = StarParser(filename)
assert len(parser.data_blocks) == 1
assert parser.data_blocks['']['no_quote_string'] == "noquote"
assert parser.data_blocks['']['quote_string'] == "quote string"
assert parser.data_blocks['']['whitespace_string'] == " "
assert parser.data_blocks['']['empty_string'] == ""

@pytest.mark.parametrize("quote_character, filename", [("'",loop_single_quote),
('"',loop_double_quote),
])
def test_quote_loop(quote_character,filename):

@pytest.mark.parametrize("quote_character, filename", [("'", loop_single_quote),
('"', loop_double_quote),
])
def test_quote_loop(quote_character, filename):
import math
parser = StarParser(filename)
assert len(parser.data_blocks) == 1
assert parser.data_blocks[''].loc[0,'no_quote_string'] == "noquote"
assert parser.data_blocks[''].loc[0,'quote_string'] == "quote string"
assert parser.data_blocks[''].loc[0,'whitespace_string'] == " "
assert parser.data_blocks[''].loc[0,'empty_string'] == ""
assert parser.data_blocks[''].loc[0, 'no_quote_string'] == "noquote"
assert parser.data_blocks[''].loc[0, 'quote_string'] == "quote string"
assert parser.data_blocks[''].loc[0, 'whitespace_string'] == " "
assert parser.data_blocks[''].loc[0, 'empty_string'] == ""

assert parser.data_blocks[''].dtypes['number_and_string'] == object
assert parser.data_blocks[''].dtypes['number_and_empty'] == 'float64'
assert parser.data_blocks[''].dtypes['number'] == 'float64'
assert parser.data_blocks[''].dtypes['empty_string_and_normal_string'] == object

assert math.isnan(parser.data_blocks[''].loc[1,'number_and_empty'])
assert parser.data_blocks[''].loc[0,'empty_string_and_normal_string'] == ''
assert math.isnan(parser.data_blocks[''].loc[1, 'number_and_empty'])
assert parser.data_blocks[''].loc[0, 'empty_string_and_normal_string'] == ''


def test_parse_as_string():
    """Keys/columns listed in ``parse_as_string`` must not be coerced to numbers."""
    parser = StarParser(postprocess, parse_as_string=['rlnFinalResolution', 'rlnResolution'])

    # In the simple (key/value) 'general' block the value stays a plain string.
    general = parser.data_blocks['general']
    assert type(general['rlnFinalResolution']) == str

    # In the 'fsc' loop block the column keeps object dtype instead of numeric.
    fsc = parser.data_blocks['fsc']
    assert fsc['rlnResolution'].dtype == 'object'