Fix gfile compatibility bug (#536)

yangxudong · yanzhen1233 · web-flow · commit d147a2310af8 · 2025-06-04T15:02:45.000+08:00
* Fix gfile compatibility bug

---------

Co-authored-by: yanzhen1233 <wb-yz775491@alibaba-inc.com>
diff --git a/easy_rec/python/core/sampler.py b/easy_rec/python/core/sampler.py
@@ -8,7 +8,6 @@
 import math
 import os
 import sys
-# import re
 import threading
 
 import numpy as np
@@ -20,6 +19,11 @@
 from easy_rec.python.utils.config_util import process_multi_file_input_path
 from easy_rec.python.utils.tf_utils import get_tf_type
 
+if tf.__version__.startswith('1.'):
+  from tensorflow.python.platform import gfile
+else:
+  import tensorflow.io.gfile as gfile
+
 
 # patch graph-learn string_attrs for utf-8
 @property
@@ -395,7 +399,7 @@ def _load_data(self, data_path, attr_delimiter):
     item_id_col = 0
     fea_id_col = 2
     print('NegativeSamplerInMemory: load sample feature from %s' % data_path)
-    with tf.gfile.GFile(data_path, 'r') as fin:
+    with gfile.GFile(data_path, 'r') as fin:
       for line_id, line_str in enumerate(fin):
         line_str = line_str.strip()
         cols = line_str.split('\t')
diff --git a/easy_rec/python/export.py b/easy_rec/python/export.py
@@ -5,13 +5,17 @@
 
 import tensorflow as tf
 from tensorflow.python.lib.io import file_io
-from tensorflow.python.platform import gfile
 
 from easy_rec.python.main import export
 from easy_rec.python.protos.train_pb2 import DistributionStrategy
 from easy_rec.python.utils import config_util
 from easy_rec.python.utils import estimator_utils
 
+if tf.__version__.startswith('1.'):
+  from tensorflow.python.platform import gfile
+else:
+  import tensorflow.io.gfile as gfile
+
 if tf.__version__ >= '2.0':
   tf = tf.compat.v1
 
diff --git a/easy_rec/python/input/criteo_input.py b/easy_rec/python/input/criteo_input.py
@@ -3,7 +3,6 @@
 import logging
 
 import tensorflow as tf
-from tensorflow.python.platform import gfile
 
 from easy_rec.python.input.criteo_binary_reader import BinaryDataset
 from easy_rec.python.input.input import Input
@@ -38,9 +37,9 @@ def __init__(self,
       for label_path, dense_path, category_path in zip(
           input_path.label_path, input_path.dense_path,
           input_path.category_path):
-        label_paths = gfile.Glob(input_path.label_path)
-        dense_paths = gfile.Glob(input_path.dense_path)
-        category_paths = gfile.Glob(input_path.category_path)
+        label_paths = tf.gfile.Glob(input_path.label_path)
+        dense_paths = tf.gfile.Glob(input_path.dense_path)
+        category_paths = tf.gfile.Glob(input_path.category_path)
         assert len(label_paths) == len(dense_paths) and len(label_paths) == \
             len(category_paths), 'label_path(%s) dense_path(%s) category_path(%s) ' + \
             'matched different number of files(%d %d %d)' % (
diff --git a/easy_rec/python/input/datahub_input.py b/easy_rec/python/input/datahub_input.py
@@ -6,12 +6,16 @@
 
 import tensorflow as tf
 from tensorflow.python.framework import dtypes
-from tensorflow.python.platform import gfile
 
 from easy_rec.python.input.input import Input
 from easy_rec.python.utils import odps_util
 from easy_rec.python.utils.config_util import parse_time
 
+if tf.__version__.startswith('1.'):
+  from tensorflow.python.platform import gfile
+else:
+  import tensorflow.io.gfile as gfile
+
 try:
   import common_io
 except Exception:
diff --git a/easy_rec/python/input/kafka_input.py b/easy_rec/python/input/kafka_input.py
@@ -6,12 +6,16 @@
 
 import six
 import tensorflow as tf
-from tensorflow.python.platform import gfile
 
 from easy_rec.python.input.input import Input
 from easy_rec.python.input.kafka_dataset import KafkaDataset
 from easy_rec.python.utils.config_util import parse_time
 
+if tf.__version__.startswith('1.'):
+  from tensorflow.python.platform import gfile
+else:
+  import tensorflow.io.gfile as gfile
+
 try:
   from kafka import KafkaConsumer, TopicPartition
 except ImportError:
diff --git a/easy_rec/python/input/odps_rtp_input_v2.py b/easy_rec/python/input/odps_rtp_input_v2.py
@@ -7,6 +7,10 @@
 
 from easy_rec.python.input.odps_rtp_input import OdpsRTPInput
 
+if tf.__version__.startswith('1.'):
+  from tensorflow.python.platform import gfile
+else:
+  import tensorflow.io.gfile as gfile
 try:
   import pai
   import rtp_fg
@@ -45,7 +49,7 @@ def __init__(self,
     logging.info('fg config path: {}'.format(self._fg_config_path))
     if self._fg_config_path is None:
       raise ValueError('fg_json_path is not set')
-    with tf.gfile.GFile(self._fg_config_path, 'r') as f:
+    with gfile.GFile(self._fg_config_path, 'r') as f:
       self._fg_config = json.load(f)
 
   def _parse_table(self, *fields):
diff --git a/easy_rec/python/input/parquet_input.py b/easy_rec/python/input/parquet_input.py
@@ -3,22 +3,18 @@
 import logging
 import multiprocessing
 import queue
-# import threading
 import time
 
-# import numpy as np
-# import pandas as pd
 import tensorflow as tf
-# from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-# from tensorflow.python.ops import logging_ops
-# from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import gfile
 
 from easy_rec.python.compat import queues
 from easy_rec.python.input import load_parquet
 from easy_rec.python.input.input import Input
 
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
 
 class ParquetInput(Input):
 
@@ -40,7 +36,7 @@ def __init__(self,
 
     self._input_files = []
     for sub_path in input_path.strip().split(','):
-      self._input_files.extend(gfile.Glob(sub_path))
+      self._input_files.extend(tf.gfile.Glob(sub_path))
     logging.info('parquet input_path=%s file_num=%d' %
                  (input_path, len(self._input_files)))
     mp_ctxt = multiprocessing.get_context('spawn')
diff --git a/easy_rec/python/input/parquet_input_v3.py b/easy_rec/python/input/parquet_input_v3.py
@@ -3,7 +3,6 @@
 import logging
 
 import tensorflow as tf
-from tensorflow.python.platform import gfile
 
 from easy_rec.python.input.input import Input
 from easy_rec.python.utils.input_utils import get_type_defaults
@@ -19,6 +18,9 @@
   _has_deep_rec = False
   pass
 
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+
 
 class ParquetInputV3(Input):
 
@@ -114,7 +116,7 @@ def _parse_dataframe(self, df):
   def _build(self, mode, params):
     input_files = []
     for sub_path in self._input_path.strip().split(','):
-      input_files.extend(gfile.Glob(sub_path))
+      input_files.extend(tf.gfile.Glob(sub_path))
     file_num = len(input_files)
     logging.info('[task_index=%d] total_file_num=%d task_num=%d' %
                  (self._task_index, file_num, self._task_num))
diff --git a/easy_rec/python/main.py b/easy_rec/python/main.py
@@ -14,7 +14,6 @@
 import six
 import tensorflow as tf
 from tensorflow.core.protobuf import saved_model_pb2
-from tensorflow.python.platform import gfile
 
 import easy_rec
 from easy_rec.python.builders import strategy_builder
@@ -240,27 +239,27 @@ def _metric_cmp_fn(best_eval_result, current_eval_result):
 
 def _check_model_dir(model_dir, continue_train):
   if not continue_train:
-    if not gfile.IsDirectory(model_dir):
-      gfile.MakeDirs(model_dir)
+    if not tf.gfile.IsDirectory(model_dir):
+      tf.gfile.MakeDirs(model_dir)
     else:
-      assert len(gfile.Glob(model_dir + '/model.ckpt-*.meta')) == 0, \
+      assert len(tf.gfile.Glob(model_dir + '/model.ckpt-*.meta')) == 0, \
           'model_dir[=%s] already exists and not empty(if you ' \
           'want to continue train on current model_dir please ' \
           'delete dir %s or specify --continue_train[internal use only])' % (
               model_dir, model_dir)
   else:
-    if not gfile.IsDirectory(model_dir):
+    if not tf.gfile.IsDirectory(model_dir):
       logging.info('%s does not exists, create it automatically' % model_dir)
-      gfile.MakeDirs(model_dir)
+      tf.gfile.MakeDirs(model_dir)
 
 
 def _get_ckpt_path(pipeline_config, checkpoint_path):
   if checkpoint_path != '' and checkpoint_path is not None:
-    if gfile.IsDirectory(checkpoint_path):
+    if tf.gfile.IsDirectory(checkpoint_path):
       ckpt_path = estimator_utils.latest_checkpoint(checkpoint_path)
     else:
       ckpt_path = checkpoint_path
-  elif gfile.IsDirectory(pipeline_config.model_dir):
+  elif tf.gfile.IsDirectory(pipeline_config.model_dir):
     ckpt_path = estimator_utils.latest_checkpoint(pipeline_config.model_dir)
     logging.info('checkpoint_path is not specified, '
                  'will use latest checkpoint %s from %s' %
@@ -284,7 +283,8 @@ def train_and_evaluate(pipeline_config_path, continue_train=False):
   Returns:
     None, the model will be saved into pipeline_config.model_dir
   """
-  assert gfile.Exists(pipeline_config_path), 'pipeline_config_path not exists'
+  assert tf.gfile.Exists(
+      pipeline_config_path), 'pipeline_config_path not exists'
   pipeline_config = config_util.get_configs_from_pipeline_file(
       pipeline_config_path)
 
@@ -323,7 +323,7 @@ def _train_and_evaluate_impl(pipeline_config,
   if estimator_utils.is_chief():
     _check_model_dir(pipeline_config.model_dir, continue_train)
     config_util.save_pipeline_config(pipeline_config, pipeline_config.model_dir)
-    with gfile.GFile(version_file, 'w') as f:
+    with tf.gfile.GFile(version_file, 'w') as f:
       f.write(easy_rec.__version__ + '\n')
 
   train_steps = None
@@ -509,7 +509,7 @@ def evaluate(pipeline_config,
   model_dir = pipeline_config.model_dir
   eval_result_file = os.path.join(model_dir, eval_result_filename)
   logging.info('save eval result to file %s' % eval_result_file)
-  with gfile.GFile(eval_result_file, 'w') as ofile:
+  with tf.gfile.GFile(eval_result_file, 'w') as ofile:
     result_to_write = {}
     for key in sorted(eval_result):
       # skip logging binary data
@@ -562,10 +562,10 @@ def distribute_evaluate(pipeline_config,
     return eval_result
   model_dir = get_model_dir_path(pipeline_config)
   eval_tmp_results_dir = os.path.join(model_dir, 'distribute_eval_tmp_results')
-  if not gfile.IsDirectory(eval_tmp_results_dir):
+  if not tf.gfile.IsDirectory(eval_tmp_results_dir):
     logging.info('create eval tmp results dir {}'.format(eval_tmp_results_dir))
-    gfile.MakeDirs(eval_tmp_results_dir)
-  assert gfile.IsDirectory(
+    tf.gfile.MakeDirs(eval_tmp_results_dir)
+  assert tf.gfile.IsDirectory(
       eval_tmp_results_dir), 'tmp results dir not create success.'
   os.environ['eval_tmp_results_dir'] = eval_tmp_results_dir
 
@@ -679,7 +679,7 @@ def distribute_evaluate(pipeline_config,
   if cur_job_name == 'master':
     print('eval_result = ', eval_result)
     logging.info('eval_result = {0}'.format(eval_result))
-    with gfile.GFile(eval_result_file, 'w') as ofile:
+    with tf.gfile.GFile(eval_result_file, 'w') as ofile:
       result_to_write = {'eval_method': 'distribute'}
       for key in sorted(eval_result):
         # skip logging binary data
@@ -766,8 +766,8 @@ def export(export_dir,
     AssertionError, if:
       * pipeline_config_path does not exist
   """
-  if not gfile.Exists(export_dir):
-    gfile.MakeDirs(export_dir)
+  if not tf.gfile.Exists(export_dir):
+    tf.gfile.MakeDirs(export_dir)
 
   pipeline_config = config_util.get_configs_from_pipeline_file(pipeline_config)
   if pipeline_config.fg_json_path:
@@ -830,10 +830,10 @@ def export(export_dir,
   ]
   export_ts = export_ts[-1]
   saved_pb_path = os.path.join(final_export_dir, 'saved_model.pb')
-  with gfile.GFile(saved_pb_path, 'rb') as fin:
+  with tf.gfile.GFile(saved_pb_path, 'rb') as fin:
     saved_model.ParseFromString(fin.read())
   saved_model.meta_graphs[0].meta_info_def.meta_graph_version = export_ts
-  with gfile.GFile(saved_pb_path, 'wb') as fout:
+  with tf.gfile.GFile(saved_pb_path, 'wb') as fout:
     fout.write(saved_model.SerializeToString())
 
   logging.info('model has been exported to %s successfully' % final_export_dir)
diff --git a/easy_rec/python/test/hpo_test.py b/easy_rec/python/test/hpo_test.py
@@ -15,13 +15,13 @@
 from easy_rec.python.utils import test_utils
 
 if tf.__version__ >= '2.0':
-  gfile = tf.compat.v1.gfile
+  import tensorflow.io.gfile as gfile
   from tensorflow.core.protobuf import config_pb2
 
   ConfigProto = config_pb2.ConfigProto
   GPUOptions = config_pb2.GPUOptions
 else:
-  gfile = tf.gfile
+  from tensorflow.python.platform import gfile
   GPUOptions = tf.GPUOptions
   ConfigProto = tf.ConfigProto
 
diff --git a/easy_rec/python/train_eval.py b/easy_rec/python/train_eval.py
@@ -6,7 +6,6 @@
 import os
 
 import tensorflow as tf
-from tensorflow.python.platform import gfile
 
 from easy_rec.python.main import _train_and_evaluate_impl
 from easy_rec.python.protos.train_pb2 import DistributionStrategy
@@ -19,6 +18,11 @@
 from easy_rec.python.utils.config_util import set_eval_input_path
 from easy_rec.python.utils.config_util import set_train_input_path
 
+if tf.__version__.startswith('1.'):
+  from tensorflow.python.platform import gfile
+else:
+  import tensorflow.io.gfile as gfile
+
 from easy_rec.python.utils.distribution_utils import set_tf_config_and_get_train_worker_num_on_ds  # NOQA
 
 if tf.__version__ >= '2.0':