Database ODBC channel identifier returned from a call to RODM_open_dbms_connection
data_table_name
Database table/view containing the training dataset.
case_id_column_name
Column in data_table_name that uniquely identifies each row (case).
target_column_name
Target column name in data_table_name.
model_name
ODM Model name.
auto_data_prep
Whether or not ODM should invoke automatic data preparation for the build.
class_priors
User-specified priors for the target classes.
retrieve_outputs_to_R
Flag controlling whether the output results are moved to the R environment.
leave_model_in_dbms
Flag controlling whether the model is deleted or left in the RDBMS.
sql.log.file
File to which the log of all the SQL calls made by this function is appended.
Details
Naive Bayes (NB) for classification makes predictions using Bayes'
Theorem assuming that each attribute is conditionally independent of
the others given a particular value of the target (Duda, Hart and
Stork 2000). NB provides a very flexible general classifier for fast
model building and scoring that can be used for both binary and
multi-class classification problems.
For more details on the algorithm implementation, parameter settings and
characteristics of the ODM function itself consult the following Oracle documents: ODM Concepts,
ODM Application Developer's Guide, Oracle SQL Packages: Data Mining, and Oracle Database SQL Language
Reference (Data Mining functions), listed in the references below.
Value
If retrieve_outputs_to_R is TRUE, returns a list with the following elements:
# Predicting survival in the sinking of the Titanic based on passenger's sex, age, class, etc.
## Not run:
# Workflow: open a database connection, split the Titanic data into train/test
# halves, push both tables to the database, build a Naive Bayes model with
# user-specified priors, score the test table, evaluate the predictions, and
# finally drop every database object created along the way.
# Requires the PASWR package (data) and the verification package (ROC tools).
DB <- RODM_open_dbms_connection(dsn="orcl11g", uid= "rodm", pwd = "rodm")

# --- Prepare the data ---------------------------------------------------
data(titanic3, package="PASWR")                                            # Load survival data from Titanic
ds <- titanic3[,c("pclass", "survived", "sex", "age", "fare", "embarked")] # Select subset of attributes
ds[,"survived"] <- ifelse(ds[,"survived"] == 1, "Yes", "No")               # Rename target values
n.rows <- nrow(ds)                                                         # Number of rows
set.seed(seed=6218945)                                                     # Fixed seed so the split is reproducible
random_sample <- sample(seq_len(n.rows), ceiling(n.rows/2))                # Split dataset randomly in train/test subsets
titanic_train <- ds[random_sample,]                                        # Training set
titanic_test <- ds[setdiff(seq_len(n.rows), random_sample),]               # Test set (rows not sampled for training)
RODM_create_dbms_table(DB, "titanic_train")                                # Push the training table to the database
RODM_create_dbms_table(DB, "titanic_test")                                 # Push the testing table to the database

# If the target distribution does not reflect the actual distribution due
# to specialized sampling, specify priors for the model
priors <- data.frame(
  target_value = c("Yes", "No"),
  prior_probability = c(0.1, 0.9))

# --- Create an ODM Naive Bayes model ------------------------------------
nb <- RODM_create_nb_model(
  database = DB,                       # Database ODBC channel identifier
  model_name = "titanic_nb_model",     # ODM model name
  data_table_name = "titanic_train",   # (in quotes) Data frame or database table containing the input dataset
  class_priors = priors,               # User-specified priors
  target_column_name = "survived")     # Target column name in data_table_name

# --- Predict test data using the Naive Bayes model ----------------------
nb2 <- RODM_apply_model(
  database = DB,                       # Database ODBC channel identifier
  data_table_name = "titanic_test",    # Database table containing the input dataset
  model_name = "titanic_nb_model",     # ODM model name
  supplemental_cols = "survived")      # Carry the target column to the output for analysis

# --- Compute contingency matrix, performance statistics and ROC curve ---
print(nb2$model.apply.results[1:10,])                                      # Print example of prediction results
actual <- nb2$model.apply.results[, "SURVIVED"]
predicted <- nb2$model.apply.results[, "PREDICTION"]
# as.numeric() replaces as.real(), which is defunct since R 3.0.0.
# Going through as.character() first converts the printed values rather than
# internal factor codes, should the column come back as a factor.
probs <- as.numeric(as.character(nb2$model.apply.results[, "'Yes'"]))
table(actual, predicted, dnn = c("Actual", "Predicted"))                   # Confusion matrix

library(verification)
actual_is_yes <- ifelse(actual == "Yes", 1, 0)                             # 0/1 coding of the observed outcome
perf.auc <- roc.area(actual_is_yes, probs)                                 # Compute area under the ROC curve
auc.roc <- signif(perf.auc$A, digits=3)
auc.roc.p <- signif(perf.auc$p.value, digits=3)
roc.plot(actual_is_yes, probs, binormal=TRUE, plot="both", xlab="False Positive Rate",
         ylab="True Positive Rate", main= "Titanic survival ODM NB model ROC Curve")
text(0.7, 0.4, labels= paste("AUC ROC:", auc.roc))                         # Annotate the plot with the AUC
text(0.7, 0.3, labels= paste("p-value:", auc.roc.p))                       # ... and its p-value

nb # look at the model details

# --- Clean up database objects created by this example ------------------
RODM_drop_model(DB, "titanic_nb_model")                                    # Drop the model
RODM_drop_dbms_table(DB, "titanic_train")                                  # Drop the training table in the database
RODM_drop_dbms_table(DB, "titanic_test")                                   # Drop the testing table in the database
RODM_close_dbms_connection(DB)
## End(Not run)