Compare revisions
Target project: hpc-team-public/workshop-forests-in-hpc (revision: main); source revision: main.
Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (7)
#!/bin/bash
#SBATCH --job-name=train-forest-script
#SBATCH -p gpu                               # request a GPU node for the training
#SBATCH -G rtx5000:1                         # request a specific GPU; run sinfo -p gpu --format=%N,%G to see what is available
#SBATCH --nodes=1                            # total number of nodes
#SBATCH --ntasks=1                           # total number of tasks
#SBATCH --time=00:02:00                      # total run time limit (HH:MM:SS) # TODO: estimate the time you will need and change this (e.g. to 2 hours)
#SBATCH --mail-type=begin                    # send mail when job begins
#SBATCH --mail-type=end                      # send mail when job ends
#SBATCH --mail-user=dorothea.sommer@gwdg.de  # TODO: change this to your mail address!

# If you need a specific GPU:
# sinfo -p gpu --format=%N,%G   # to see what is available
# You can request specific GPUs with e.g. #SBATCH -G gtx1080:1

# Prepare the environment.
module load python/3.9
module load anaconda3
@@ -52,7 +52,7 @@ def create_dataloader(data_folder : str, validation_percentage = 0.15, verbose=T
     return train_loader, val_loader

-def train(model, train_loader, val_loader, num_training_epochs = 100, save_model=False):
+def train(model, train_loader, val_loader, optimizer, num_training_epochs = 100, saved_models_path=None):
     """The training loop."""
     for epoch in range(num_training_epochs):
         model.train()
@@ -97,16 +97,17 @@ def train(model, train_loader, val_loader, num_training_epochs = 100, save_model
         percent_correct = total_correct / total_pred * 100
         print(f"Epoch {epoch + 1} | validation accuracy {percent_correct}")

-        if epoch % 10 == 9 and save_model:  # save model every 10 epochs
+        if saved_models_path:
             model_name = "tree_pointnet"
             curr_time = "{0:%Y-%m-%d--%H-%M-%S}".format(datetime.datetime.now())
             save_model_in_file = model_name + "-epoch-" + str(epoch) + "-time-" + curr_time + ".pt"
             torch.save(model.state_dict(), os.path.join(saved_models_path, save_model_in_file))

-    model_name = "tree_pointnet"
-    curr_time = "{0:%Y-%m-%d--%H-%M-%S}".format(datetime.datetime.now())
-    save_model_in_file = model_name + "-epoch-" + "final" + "-time-" + curr_time + ".pt"
-    torch.save(model.state_dict(), os.path.join(saved_models_path, save_model_in_file))
+    if saved_models_path:
+        model_name = "tree_pointnet"
+        curr_time = "{0:%Y-%m-%d--%H-%M-%S}".format(datetime.datetime.now())
+        save_model_in_file = model_name + "-epoch-" + "final" + "-time-" + curr_time + ".pt"
+        torch.save(model.state_dict(), os.path.join(saved_models_path, save_model_in_file))

     return model
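As a side note to the hunk above: the training loop writes checkpoints as plain state_dicts, so a saved model can later be restored for evaluation roughly as sketched below. This is a minimal illustration only; the model class name PointNet and the checkpoint filename are assumptions, not names taken from this repository.

# Illustrative sketch (not part of the diff): reload a checkpoint written by train().
# `PointNet` and the filename are placeholders -- use the repository's real model
# class and one of the files that appear under ./saved_models after training.
import torch

model = PointNet(n_classes=n_classes)                     # hypothetical constructor
state = torch.load("saved_models/tree_pointnet-epoch-final-time-<timestamp>.pt",
                   map_location="cpu")                    # evaluate on CPU if no GPU is free
model.load_state_dict(state)
model.eval()                                              # disable dropout/batch-norm updates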
@@ -117,6 +118,8 @@ if __name__ == "__main__":
     n_classes = len(TREE_SPECIES)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU available?
     print(f"Training with: {device}")

+    saved_models_path = "./saved_models"
+    if not os.path.exists(saved_models_path):
+        os.makedirs(saved_models_path)
@@ -142,6 +145,6 @@ if __name__ == "__main__":
     optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

     ### TRAINING LOOP ###
-    model = train(model, train_loader, val_loader, num_training_epochs = num_training_epochs, save_model=True)
+    model = train(model, train_loader, val_loader, optimizer, num_training_epochs = num_training_epochs, saved_models_path=saved_models_path)
     print("Finished training.")
\ No newline at end of file
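For orientation beyond the diff: once a model has been trained (or reloaded as sketched earlier), a single prediction can be mapped back to a species name roughly as follows. This is an illustrative sketch only; points is a hypothetical single point-cloud tensor shaped like one training sample, and it assumes the model returns one logit per class (consistent with the accuracy computation above) and that TREE_SPECIES is an indexable sequence of names.

# Illustrative sketch (not part of the diff): classify one sample with the trained model.
# `model`, `device` and `TREE_SPECIES` come from the training script; `points` is an
# assumed single point-cloud tensor without a batch dimension.
import torch

model.eval()
with torch.no_grad():
    logits = model(points.unsqueeze(0).to(device))  # add a batch dimension of 1
    pred_idx = int(torch.argmax(logits, dim=1))     # index of the highest-scoring class
print(f"Predicted species: {TREE_SPECIES[pred_idx]}")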