Fixes for #664 and #678 for 1.4.1 #732

Merged: 3 commits, Apr 14, 2023
Changes from all commits
9 changes: 9 additions & 0 deletions changelogs/fragments/732-zos_copy-encoding-bugs.yml
@@ -0,0 +1,9 @@
bugfixes:
- zos_copy - Fixes a bug where files not encoded in IBM-1047
would trigger an error while computing the record length
for a new destination dataset. Issue 664.
(https://github.com/ansible-collections/ibm_zos_core/pull/732)
- zos_copy - Fixes a bug where the code for fixing an issue with
newlines in files (issue 599) would use the wrong encoding
for normalization. Issue 678.
(https://github.com/ansible-collections/ibm_zos_core/pull/732)
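
For context, the two fixes reduce to: measure record lengths with an explicit text encoding, and scan for the EBCDIC CR+NL pair in binary mode. Below is a minimal standalone sketch of both behaviors; the function names and the one-byte carry across chunk boundaries are ours for illustration, not the module's exact code.

def get_longest_record(path):
    # Read as UTF-8 instead of the locale default, which raised errors
    # for sources that were not IBM-1047 compatible (issue 664).
    with open(path, "r", encoding="utf-8") as src:
        return max((len(line.rstrip("\n")) for line in src), default=0)

def has_ebcdic_crlf(path, chunk_size=1024):
    # In EBCDIC (IBM-037), CR is 0x0D and NL is 0x15. Binary mode keeps
    # Python from translating line endings before we can inspect them.
    with open(path, "rb") as src:
        prev = b""
        chunk = src.read(chunk_size)
        while chunk:
            # Carry the previous chunk's last byte so a CR+NL pair that
            # straddles a chunk boundary is still detected.
            if b"\x0d\x15" in prev + chunk:
                return True
            prev = chunk[-1:]
            chunk = src.read(chunk_size)
    return False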
173 changes: 129 additions & 44 deletions plugins/modules/zos_copy.py
@@ -931,15 +931,20 @@ def file_has_crlf_endings(self, src):
{bool} -- True if the file uses CRLF endings, False if it uses LF
ones.
"""
+# Python has to read the file in binary mode to not mask CRLF
+# endings or enable universal newlines. If we used encoding="cp037",
+# we would get '\n' as the line ending even when the file uses '\r\n'.
with open(src, "rb") as src_file:
-    # readline() will read until it finds a \n.
-    content = src_file.readline()
+    # Reading the file in 1024-byte chunks.
+    content = src_file.read(1024)

-# In EBCDIC, \r\n are bytes 0d and 15, respectively.
-if content.endswith(b'\x0d\x15'):
-    return True
-else:
-    return False
+    while content:
+        # In EBCDIC, \r\n are bytes 0d and 15, respectively.
+        if b'\x0d\x15' in content:
+            return True
+        content = src_file.read(1024)
+
+    return False

def create_temp_with_lf_endings(self, src):
"""Creates a temporary file with the same content as src but without
@@ -960,10 +965,11 @@ def create_temp_with_lf_endings(self, src):

with open(converted_src, "wb") as converted_file:
    with open(src, "rb") as src_file:
-        current_line = src_file.read()
-        converted_file.write(current_line.replace(b'\x0d', b''))
+        chunk = src_file.read(1024)
+        # In IBM-037, \r is the byte 0d.
+        converted_file.write(chunk.replace(b'\x0d', b''))

-self._tag_file_encoding(converted_src, encode.Defaults.DEFAULT_EBCDIC_MVS_CHARSET)
+self._tag_file_encoding(converted_src, "IBM-037")

return converted_src
except Exception as err:
@@ -1319,6 +1325,7 @@ def copy_to_pdse(
src_ds_type,
src_member=None,
dest_member=None,
+encoding=None,
):
"""Copy source to a PDS/PDSE or PDS/PDSE member.

@@ -1328,12 +1335,13 @@
Arguments:
src {str} -- Path to USS file/directory or data set name.
    temp_path {str} -- Path to the location where the control node
-        transferred data to
+        transferred data to.
    conv_path {str} -- Path to the converted source file/directory
-    dest {str} -- Name of destination data set
-    src_ds_type {str} -- The type of source
+    dest {str} -- Name of destination data set.
+    src_ds_type {str} -- The type of source.
    src_member {bool, optional} -- Member of the source data set to copy.
-    dest_member {str, optional} -- Name of destination member in data set
+    dest_member {str, optional} -- Name of destination member in data set.
+    encoding {dict, optional} -- Dictionary with encoding options.
"""
new_src = conv_path or temp_path or src

@@ -1352,6 +1360,9 @@
else:
dest_copy_name = "{0}({1})".format(dest, data_set.DataSet.get_member_name_from_file(file))

+if not self.is_binary:
+    full_file_path = normalize_line_endings(full_file_path, encoding)
+
result = self.copy_to_member(full_file_path, dest_copy_name)

if result["rc"] != 0:
@@ -1459,7 +1470,7 @@ def get_file_record_length(file):
"""
max_line_length = 0

with open(file, "r") as src_file:
with open(file, "r", encoding="utf-8") as src_file:
current_line = src_file.readline()

while current_line:
@@ -2021,6 +2032,51 @@ def allocate_destination_data_set(
return True


def normalize_line_endings(src, encoding=None):
"""
Normalizes src's encoding to IBM-037 (a dataset's default) and then normalizes
its line endings to LF.
Arguments:
src (str) -- Path of a USS file.
encoding (dict, optional) -- Encoding options for the module.
Returns:
str -- Path to the normalized file.
"""
# Before copying into a destination dataset, we'll make sure that
# the source file doesn't contain any carriage returns that would
# result in empty records in the destination.
# Due to the differences between encodings, we'll normalize to IBM-037
# before checking the EOL sequence.
enc_utils = encode.EncodeUtils()
src_tag = enc_utils.uss_file_tag(src)
copy_handler = CopyHandler(AnsibleModuleHelper(dict()))

if src_tag == "untagged":
# This should only be true when src is a remote file and no encoding
# was specified by the user.
if not encoding:
encoding = {"from": encode.Defaults.get_default_system_charset()}
src_tag = encoding["from"]

if src_tag != "IBM-037":
fd, converted_src = tempfile.mkstemp()
os.close(fd)

enc_utils.uss_convert_encoding(
src,
converted_src,
src_tag,
"IBM-037"
)
copy_handler._tag_file_encoding(converted_src, "IBM-037")
src = converted_src

if copy_handler.file_has_crlf_endings(src):
src = copy_handler.create_temp_with_lf_endings(src)

return src
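
As a local illustration of what normalize_line_endings produces: re-encode the file to IBM-037, then strip carriage returns. The sketch below uses Python's cp037 codec as a stand-in; the module itself converts and tags files on z/OS through EncodeUtils.uss_convert_encoding and _tag_file_encoding, so treat this as an approximation, not the module's implementation.

import os
import tempfile

def normalize_line_endings_local(src, src_encoding="utf-8"):
    # Re-encode the content to IBM-037 (Python codec "cp037"), then drop
    # carriage returns (0x0D in IBM-037) so a destination data set would
    # not end up with empty records.
    with open(src, "rb") as f:
        data = f.read()
    ebcdic = data.decode(src_encoding).encode("cp037")
    fd, out_path = tempfile.mkstemp()
    with os.fdopen(fd, "wb") as out:
        out.write(ebcdic.replace(b"\x0d", b""))
    return out_path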


def run_module(module, arg_def):
# ********************************************************************
# Verify the validity of module args. BetterArgParser raises ValueError
@@ -2102,13 +2158,43 @@ def run_module(module, arg_def):
# and destination datasets, if needed.
# ********************************************************************
dest_member_exists = False
+converted_src = None
try:
# If temp_path, the plugin has copied a file from the controller to USS.
if temp_path or "/" in src:
src_ds_type = "USS"

if remote_src and os.path.isdir(src):
is_src_dir = True

if not is_uss:
new_src = temp_path or src
new_src = os.path.normpath(new_src)
# Normalizing encoding when src is a USS file (only).
encode_utils = encode.EncodeUtils()
src_tag = encode_utils.uss_file_tag(new_src)
# Normalizing to UTF-8.
if not is_src_dir and src_tag != "UTF-8":
# If untagged, assuming the encoding/tag is the system's default.
if src_tag == "untagged" or src_tag is None:
if encoding:
src_tag = encoding["from"]
else:
src_tag = encode.Defaults.get_default_system_charset()

# Converting the original src to a temporary one in UTF-8.
fd, converted_src = tempfile.mkstemp()
os.close(fd)
encode_utils.uss_convert_encoding(
new_src,
converted_src,
src_tag,
"UTF-8"
)

# Creating the handler just for tagging, we're not copying yet!
copy_handler = CopyHandler(module, is_binary=is_binary)
copy_handler._tag_file_encoding(converted_src, "UTF-8")
else:
if data_set.DataSet.data_set_exists(src_name):
if src_member and not data_set.DataSet.data_set_member_exists(src):
@@ -2284,6 +2370,14 @@
emergency_backup = data_set.DataSet.temp_name()
data_set.DataSet.allocate_model_data_set(emergency_backup, dest_name)

if converted_src:
if remote_src:
original_src = src
src = converted_src
else:
original_temp = temp_path
temp_path = converted_src

try:
if not is_uss:
res_args["changed"] = allocate_destination_data_set(
@@ -2300,8 +2394,19 @@
if dest_exists and not force:
restore_backup(dest_name, emergency_backup, dest_ds_type, use_backup)
erase_backup(emergency_backup, dest_ds_type)
if converted_src:
if remote_src:
src = original_src
else:
temp_path = original_temp
module.fail_json(msg="Unable to allocate destination data set: {0}".format(str(err)))

if converted_src:
if remote_src:
src = original_src
else:
temp_path = original_temp

# ********************************************************************
# Encoding conversion is only valid if the source is a local file,
# local directory or a USS file/directory.
@@ -2370,35 +2475,8 @@
# ---------------------------------------------------------------------
elif dest_ds_type in data_set.DataSet.MVS_SEQ:
if src_ds_type == "USS" and not is_binary:
-# Before copying into the destination dataset, we'll make sure that
-# the source file doesn't contain any carriage returns that would
-# result in empty records in the destination.
-# Due to the differences between encodings, we'll normalize to IBM-037
-# before checking the EOL sequence.
new_src = conv_path or temp_path or src
-enc_utils = encode.EncodeUtils()
-src_tag = enc_utils.uss_file_tag(new_src)
-
-if src_tag == "untagged":
-    src_tag = encode.Defaults.DEFAULT_EBCDIC_USS_CHARSET
-
-if src_tag not in encode.Defaults.DEFAULT_EBCDIC_MVS_CHARSET:
-    fd, converted_src = tempfile.mkstemp()
-    os.close(fd)
-
-    enc_utils.uss_convert_encoding(
-        new_src,
-        converted_src,
-        src_tag,
-        encode.Defaults.DEFAULT_EBCDIC_MVS_CHARSET
-    )
-    copy_handler._tag_file_encoding(converted_src, encode.Defaults.DEFAULT_EBCDIC_MVS_CHARSET)
-    new_src = converted_src
-
-if copy_handler.file_has_crlf_endings(new_src):
-    new_src = copy_handler.create_temp_with_lf_endings(new_src)
-
-conv_path = new_src
+conv_path = normalize_line_endings(new_src, encoding)

copy_handler.copy_to_seq(
src,
@@ -2421,7 +2499,14 @@
)

pdse_copy_handler.copy_to_pdse(
-    src, temp_path, conv_path, dest_name, src_ds_type, src_member=src_member, dest_member=dest_member
+    src,
+    temp_path,
+    conv_path,
+    dest_name,
+    src_ds_type,
+    src_member=src_member,
+    dest_member=dest_member,
+    encoding=encoding
)
res_args["changed"] = True
dest = dest.upper()
90 changes: 90 additions & 0 deletions tests/functional/modules/test_zos_copy_func.py
@@ -35,6 +35,8 @@
DUMMY DATA ---- LINE 007 ------
"""

+DUMMY_DATA_CRLF = b"00000001 DUMMY DATA\r\n00000002 DUMMY DATA\r\n"

VSAM_RECORDS = """00000001A record
00000002A record
00000003A record
@@ -109,6 +111,12 @@ def populate_dir(dir_path):
infile.write(DUMMY_DATA)


def populate_dir_crlf_endings(dir_path):
for i in range(5):
with open(os.path.join(dir_path, "file{0}".format(i)), "wb") as infile:
infile.write(DUMMY_DATA_CRLF)


def populate_partitioned_data_set(hosts, name, ds_type, members=None):
"""Creates a new partitioned data set and inserts records into various
members of it.
@@ -1058,6 +1066,56 @@ def test_copy_file_record_length_to_sequential_data_set(ansible_zos_module):
os.remove(src)


@pytest.mark.uss
@pytest.mark.seq
def test_copy_file_crlf_endings_to_sequential_data_set(ansible_zos_module):
hosts = ansible_zos_module
dest = "USER.TEST.SEQ.FUNCTEST"

fd, src = tempfile.mkstemp()
os.close(fd)
with open(src, "wb") as infile:
infile.write(DUMMY_DATA_CRLF)

try:
hosts.all.zos_data_set(name=dest, state="absent")

copy_result = hosts.all.zos_copy(
src=src,
dest=dest,
remote_src=False,
is_binary=False
)

verify_copy = hosts.all.shell(
cmd="cat \"//'{0}'\"".format(dest),
executable=SHELL_EXECUTABLE,
)

verify_recl = hosts.all.shell(
cmd="dls -l {0}".format(dest),
executable=SHELL_EXECUTABLE,
)

for cp_res in copy_result.contacted.values():
assert cp_res.get("msg") is None
assert cp_res.get("changed") is True
assert cp_res.get("dest") == dest
for v_cp in verify_copy.contacted.values():
assert v_cp.get("rc") == 0
assert len(v_cp.get("stdout_lines")) == 2
for v_recl in verify_recl.contacted.values():
assert v_recl.get("rc") == 0
stdout = v_recl.get("stdout").split()
assert len(stdout) == 5
assert stdout[1] == "PS"
assert stdout[2] == "FB"
assert stdout[3] == "19"
finally:
hosts.all.zos_data_set(name=dest, state="absent")
os.remove(src)
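
The asserted record length of 19 follows from the fixture: each record of DUMMY_DATA_CRLF is 19 bytes once its CRLF pair is stripped. A quick standalone check of that arithmetic:

records = DUMMY_DATA_CRLF.split(b"\r\n")
# Two 19-byte records; the empty trailing element comes from the final CRLF.
assert max(len(r) for r in records) == 19
assert len([r for r in records if r]) == 2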


@pytest.mark.uss
@pytest.mark.seq
@pytest.mark.parametrize("src", [
@@ -1614,6 +1672,38 @@ def test_copy_dir_to_non_existing_pdse(ansible_zos_module):
hosts.all.zos_data_set(name=dest, state="absent")


@pytest.mark.uss
@pytest.mark.pdse
def test_copy_dir_crlf_endings_to_non_existing_pdse(ansible_zos_module):
hosts = ansible_zos_module
dest = "USER.TEST.PDSE.FUNCTEST"

temp_path = tempfile.mkdtemp()
src_basename = "source/"
source_path = "{0}/{1}".format(temp_path, src_basename)

try:
os.mkdir(source_path)
populate_dir_crlf_endings(source_path)

copy_res = hosts.all.zos_copy(src=source_path, dest=dest)
verify_copy = hosts.all.shell(
cmd="cat \"//'{0}({1})'\"".format(dest, "FILE2"),
executable=SHELL_EXECUTABLE,
)

for result in copy_res.contacted.values():
assert result.get("msg") is None
assert result.get("changed") is True
assert result.get("dest") == dest
for result in verify_copy.contacted.values():
assert result.get("rc") == 0
assert len(result.get("stdout_lines")) == 2
finally:
shutil.rmtree(temp_path)
hosts.all.zos_data_set(name=dest, state="absent")


@pytest.mark.uss
@pytest.mark.pdse
@pytest.mark.parametrize("src_type", ["pds", "pdse"])