Compare revisions
Target project: hpc-team-public/workshop-forests-in-hpc (revision: main); source revision: main.
Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (7)
#!/bin/bash
#SBATCH --job-name=train-forest-script
#SBATCH -p gpu                               # request a GPU node for the training
#SBATCH -G rtx5000:1                         # request a specific GPU; run sinfo -p gpu --format=%N,%G to see what is available
#SBATCH --nodes=1                            # total number of nodes
#SBATCH --ntasks=1                           # total number of tasks
#SBATCH --time=00:02:00                      # total run time limit (HH:MM:SS) # TODO: estimate the time you will need and change this (e.g. to 2 hours)
#SBATCH --mail-type=begin                    # send mail when job begins
#SBATCH --mail-type=end                      # send mail when job ends
#SBATCH --mail-user=dorothea.sommer@gwdg.de  # TODO: change this to your mail address!

# If you need a specific GPU:
# sinfo -p gpu --format=%N,%G   # to see what is available
# You can request specific GPUs with e.g. #SBATCH -G gtx1080:1

# Prepare the environment.
module load python/3.9
module load anaconda3
@@ -52,7 +52,7 @@ def create_dataloader(data_folder : str, validation_percentage = 0.15, verbose=T
     return train_loader, val_loader

-def train(model, train_loader, val_loader, num_training_epochs = 100, save_model=False):
+def train(model, train_loader, val_loader, optimizer, num_training_epochs = 100, saved_models_path=None):
     """The training loop."""
     for epoch in range(num_training_epochs):
         model.train()
@@ -97,16 +97,17 @@ def train(model, train_loader, val_loader, num_training_epochs = 100, save_model
         percent_correct = total_correct / total_pred * 100
         print(f"Epoch {epoch + 1} | validation accuracy {percent_correct}")

-        if epoch % 10 == 9 and save_model:  # save model every 10 epochs
+        if saved_models_path:
             model_name = "tree_pointnet"
             curr_time = "{0:%Y-%m-%d--%H-%M-%S}".format(datetime.datetime.now())
             save_model_in_file = model_name + "-epoch-" + str(epoch) + "-time-" + curr_time + ".pt"
             torch.save(model.state_dict(), os.path.join(saved_models_path, save_model_in_file))

-    model_name = "tree_pointnet"
-    curr_time = "{0:%Y-%m-%d--%H-%M-%S}".format(datetime.datetime.now())
-    save_model_in_file = model_name + "-epoch-" + "final" + "-time-" + curr_time + ".pt"
-    torch.save(model.state_dict(), os.path.join(saved_models_path, save_model_in_file))
+    if saved_models_path:
+        model_name = "tree_pointnet"
+        curr_time = "{0:%Y-%m-%d--%H-%M-%S}".format(datetime.datetime.now())
+        save_model_in_file = model_name + "-epoch-" + "final" + "-time-" + curr_time + ".pt"
+        torch.save(model.state_dict(), os.path.join(saved_models_path, save_model_in_file))

     return model
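As a side note to the hunk above: the training loop writes checkpoints as plain state_dicts, so a saved model can later be restored for evaluation roughly as sketched below. This is a minimal illustration only; the model class name PointNet and the checkpoint filename are assumptions, not names taken from this repository.

# Illustrative sketch (not part of the diff): reload a checkpoint written by train().
# `PointNet` and the filename are placeholders -- use the repository's real model
# class and one of the files that appear under ./saved_models after training.
import torch

model = PointNet(n_classes=n_classes)                     # hypothetical constructor
state = torch.load("saved_models/tree_pointnet-epoch-final-time-<timestamp>.pt",
                   map_location="cpu")                    # evaluate on CPU if no GPU is free
model.load_state_dict(state)
model.eval()                                              # disable dropout/batch-norm updates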
@@ -117,6 +118,8 @@ if __name__ == "__main__":
     n_classes = len(TREE_SPECIES)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU available?
     print(f"Training with: {device}")

+    saved_models_path = "./saved_models"
+    if not os.path.exists(saved_models_path):
+        os.makedirs(saved_models_path)
@@ -142,6 +145,6 @@ if __name__ == "__main__":
     optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

     ### TRAINING LOOP ###
-    model = train(model, train_loader, val_loader, num_training_epochs = num_training_epochs, save_model=True)
+    model = train(model, train_loader, val_loader, optimizer, num_training_epochs = num_training_epochs, saved_models_path=saved_models_path)
     print("Finished training.")
\ No newline at end of file
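For orientation beyond the diff: once a model has been trained (or reloaded as sketched earlier), a single prediction can be mapped back to a species name roughly as follows. This is an illustrative sketch only; points is a hypothetical single point-cloud tensor shaped like one training sample, and it assumes the model returns one logit per class (consistent with the accuracy computation above) and that TREE_SPECIES is an indexable sequence of names.

# Illustrative sketch (not part of the diff): classify one sample with the trained model.
# `model`, `device` and `TREE_SPECIES` come from the training script; `points` is an
# assumed single point-cloud tensor without a batch dimension.
import torch

model.eval()
with torch.no_grad():
    logits = model(points.unsqueeze(0).to(device))  # add a batch dimension of 1
    pred_idx = int(torch.argmax(logits, dim=1))     # index of the highest-scoring class
print(f"Predicted species: {TREE_SPECIES[pred_idx]}")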