257 lines
10 KiB
Python
257 lines
10 KiB
Python
# Copyright (c) OpenMMLab. All rights reserved.
|
|
import datetime
|
|
import os
|
|
import os.path as osp
|
|
from collections import OrderedDict
|
|
|
|
import torch
|
|
import torch.distributed as dist
|
|
|
|
import annotator.mmpkg.mmcv as mmcv
|
|
from annotator.mmpkg.mmcv.fileio.file_client import FileClient
|
|
from annotator.mmpkg.mmcv.utils import is_tuple_of, scandir
|
|
from ..hook import HOOKS
|
|
from .base import LoggerHook
|
|
|
|
|
|
@HOOKS.register_module()
|
|
class TextLoggerHook(LoggerHook):
|
|
"""Logger hook in text.
|
|
|
|
In this logger hook, the information will be printed on terminal and
|
|
saved in json file.
|
|
|
|
Args:
|
|
by_epoch (bool, optional): Whether EpochBasedRunner is used.
|
|
Default: True.
|
|
interval (int, optional): Logging interval (every k iterations).
|
|
Default: 10.
|
|
ignore_last (bool, optional): Ignore the log of last iterations in each
|
|
epoch if less than :attr:`interval`. Default: True.
|
|
reset_flag (bool, optional): Whether to clear the output buffer after
|
|
logging. Default: False.
|
|
interval_exp_name (int, optional): Logging interval for experiment
|
|
name. This feature is to help users conveniently get the experiment
|
|
information from screen or log file. Default: 1000.
|
|
out_dir (str, optional): Logs are saved in ``runner.work_dir`` default.
|
|
If ``out_dir`` is specified, logs will be copied to a new directory
|
|
which is the concatenation of ``out_dir`` and the last level
|
|
directory of ``runner.work_dir``. Default: None.
|
|
`New in version 1.3.16.`
|
|
out_suffix (str or tuple[str], optional): Those filenames ending with
|
|
``out_suffix`` will be copied to ``out_dir``.
|
|
Default: ('.log.json', '.log', '.py').
|
|
`New in version 1.3.16.`
|
|
keep_local (bool, optional): Whether to keep local log when
|
|
:attr:`out_dir` is specified. If False, the local log will be
|
|
removed. Default: True.
|
|
`New in version 1.3.16.`
|
|
file_client_args (dict, optional): Arguments to instantiate a
|
|
FileClient. See :class:`mmcv.fileio.FileClient` for details.
|
|
Default: None.
|
|
`New in version 1.3.16.`
|
|
"""
|
|
|
|
def __init__(self,
|
|
by_epoch=True,
|
|
interval=10,
|
|
ignore_last=True,
|
|
reset_flag=False,
|
|
interval_exp_name=1000,
|
|
out_dir=None,
|
|
out_suffix=('.log.json', '.log', '.py'),
|
|
keep_local=True,
|
|
file_client_args=None):
|
|
super(TextLoggerHook, self).__init__(interval, ignore_last, reset_flag,
|
|
by_epoch)
|
|
self.by_epoch = by_epoch
|
|
self.time_sec_tot = 0
|
|
self.interval_exp_name = interval_exp_name
|
|
|
|
if out_dir is None and file_client_args is not None:
|
|
raise ValueError(
|
|
'file_client_args should be "None" when `out_dir` is not'
|
|
'specified.')
|
|
self.out_dir = out_dir
|
|
|
|
if not (out_dir is None or isinstance(out_dir, str)
|
|
or is_tuple_of(out_dir, str)):
|
|
raise TypeError('out_dir should be "None" or string or tuple of '
|
|
'string, but got {out_dir}')
|
|
self.out_suffix = out_suffix
|
|
|
|
self.keep_local = keep_local
|
|
self.file_client_args = file_client_args
|
|
if self.out_dir is not None:
|
|
self.file_client = FileClient.infer_client(file_client_args,
|
|
self.out_dir)
|
|
|
|
def before_run(self, runner):
|
|
super(TextLoggerHook, self).before_run(runner)
|
|
|
|
if self.out_dir is not None:
|
|
self.file_client = FileClient.infer_client(self.file_client_args,
|
|
self.out_dir)
|
|
# The final `self.out_dir` is the concatenation of `self.out_dir`
|
|
# and the last level directory of `runner.work_dir`
|
|
basename = osp.basename(runner.work_dir.rstrip(osp.sep))
|
|
self.out_dir = self.file_client.join_path(self.out_dir, basename)
|
|
runner.logger.info(
|
|
(f'Text logs will be saved to {self.out_dir} by '
|
|
f'{self.file_client.name} after the training process.'))
|
|
|
|
self.start_iter = runner.iter
|
|
self.json_log_path = osp.join(runner.work_dir,
|
|
f'{runner.timestamp}.log.json')
|
|
if runner.meta is not None:
|
|
self._dump_log(runner.meta, runner)
|
|
|
|
def _get_max_memory(self, runner):
|
|
device = getattr(runner.model, 'output_device', None)
|
|
mem = torch.cuda.max_memory_allocated(device=device)
|
|
mem_mb = torch.tensor([mem / (1024 * 1024)],
|
|
dtype=torch.int,
|
|
device=device)
|
|
if runner.world_size > 1:
|
|
dist.reduce(mem_mb, 0, op=dist.ReduceOp.MAX)
|
|
return mem_mb.item()
|
|
|
|
def _log_info(self, log_dict, runner):
|
|
# print exp name for users to distinguish experiments
|
|
# at every ``interval_exp_name`` iterations and the end of each epoch
|
|
if runner.meta is not None and 'exp_name' in runner.meta:
|
|
if (self.every_n_iters(runner, self.interval_exp_name)) or (
|
|
self.by_epoch and self.end_of_epoch(runner)):
|
|
exp_info = f'Exp name: {runner.meta["exp_name"]}'
|
|
runner.logger.info(exp_info)
|
|
|
|
if log_dict['mode'] == 'train':
|
|
if isinstance(log_dict['lr'], dict):
|
|
lr_str = []
|
|
for k, val in log_dict['lr'].items():
|
|
lr_str.append(f'lr_{k}: {val:.3e}')
|
|
lr_str = ' '.join(lr_str)
|
|
else:
|
|
lr_str = f'lr: {log_dict["lr"]:.3e}'
|
|
|
|
# by epoch: Epoch [4][100/1000]
|
|
# by iter: Iter [100/100000]
|
|
if self.by_epoch:
|
|
log_str = f'Epoch [{log_dict["epoch"]}]' \
|
|
f'[{log_dict["iter"]}/{len(runner.data_loader)}]\t'
|
|
else:
|
|
log_str = f'Iter [{log_dict["iter"]}/{runner.max_iters}]\t'
|
|
log_str += f'{lr_str}, '
|
|
|
|
if 'time' in log_dict.keys():
|
|
self.time_sec_tot += (log_dict['time'] * self.interval)
|
|
time_sec_avg = self.time_sec_tot / (
|
|
runner.iter - self.start_iter + 1)
|
|
eta_sec = time_sec_avg * (runner.max_iters - runner.iter - 1)
|
|
eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
|
|
log_str += f'eta: {eta_str}, '
|
|
log_str += f'time: {log_dict["time"]:.3f}, ' \
|
|
f'data_time: {log_dict["data_time"]:.3f}, '
|
|
# statistic memory
|
|
if torch.cuda.is_available():
|
|
log_str += f'memory: {log_dict["memory"]}, '
|
|
else:
|
|
# val/test time
|
|
# here 1000 is the length of the val dataloader
|
|
# by epoch: Epoch[val] [4][1000]
|
|
# by iter: Iter[val] [1000]
|
|
if self.by_epoch:
|
|
log_str = f'Epoch({log_dict["mode"]}) ' \
|
|
f'[{log_dict["epoch"]}][{log_dict["iter"]}]\t'
|
|
else:
|
|
log_str = f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t'
|
|
|
|
log_items = []
|
|
for name, val in log_dict.items():
|
|
# TODO: resolve this hack
|
|
# these items have been in log_str
|
|
if name in [
|
|
'mode', 'Epoch', 'iter', 'lr', 'time', 'data_time',
|
|
'memory', 'epoch'
|
|
]:
|
|
continue
|
|
if isinstance(val, float):
|
|
val = f'{val:.4f}'
|
|
log_items.append(f'{name}: {val}')
|
|
log_str += ', '.join(log_items)
|
|
|
|
runner.logger.info(log_str)
|
|
|
|
def _dump_log(self, log_dict, runner):
|
|
# dump log in json format
|
|
json_log = OrderedDict()
|
|
for k, v in log_dict.items():
|
|
json_log[k] = self._round_float(v)
|
|
# only append log at last line
|
|
if runner.rank == 0:
|
|
with open(self.json_log_path, 'a+') as f:
|
|
mmcv.dump(json_log, f, file_format='json')
|
|
f.write('\n')
|
|
|
|
def _round_float(self, items):
|
|
if isinstance(items, list):
|
|
return [self._round_float(item) for item in items]
|
|
elif isinstance(items, float):
|
|
return round(items, 5)
|
|
else:
|
|
return items
|
|
|
|
def log(self, runner):
|
|
if 'eval_iter_num' in runner.log_buffer.output:
|
|
# this doesn't modify runner.iter and is regardless of by_epoch
|
|
cur_iter = runner.log_buffer.output.pop('eval_iter_num')
|
|
else:
|
|
cur_iter = self.get_iter(runner, inner_iter=True)
|
|
|
|
log_dict = OrderedDict(
|
|
mode=self.get_mode(runner),
|
|
epoch=self.get_epoch(runner),
|
|
iter=cur_iter)
|
|
|
|
# only record lr of the first param group
|
|
cur_lr = runner.current_lr()
|
|
if isinstance(cur_lr, list):
|
|
log_dict['lr'] = cur_lr[0]
|
|
else:
|
|
assert isinstance(cur_lr, dict)
|
|
log_dict['lr'] = {}
|
|
for k, lr_ in cur_lr.items():
|
|
assert isinstance(lr_, list)
|
|
log_dict['lr'].update({k: lr_[0]})
|
|
|
|
if 'time' in runner.log_buffer.output:
|
|
# statistic memory
|
|
if torch.cuda.is_available():
|
|
log_dict['memory'] = self._get_max_memory(runner)
|
|
|
|
log_dict = dict(log_dict, **runner.log_buffer.output)
|
|
|
|
self._log_info(log_dict, runner)
|
|
self._dump_log(log_dict, runner)
|
|
return log_dict
|
|
|
|
def after_run(self, runner):
|
|
# copy or upload logs to self.out_dir
|
|
if self.out_dir is not None:
|
|
for filename in scandir(runner.work_dir, self.out_suffix, True):
|
|
local_filepath = osp.join(runner.work_dir, filename)
|
|
out_filepath = self.file_client.join_path(
|
|
self.out_dir, filename)
|
|
with open(local_filepath, 'r') as f:
|
|
self.file_client.put_text(f.read(), out_filepath)
|
|
|
|
runner.logger.info(
|
|
(f'The file {local_filepath} has been uploaded to '
|
|
f'{out_filepath}.'))
|
|
|
|
if not self.keep_local:
|
|
os.remove(local_filepath)
|
|
runner.logger.info(
|
|
(f'{local_filepath} was removed due to the '
|
|
'`self.keep_local=False`'))
|