Ry2an's Studio

SAS codes

Word count: 5.1kReading time: 30 min
2020/08/15 Share

This file contains all SAS orders that I used and able to use in future.

Data Importing

连接路径设置library

1
2
/*Set library*/
LIBNAME assign "D:\biossas\assign1";

建立新数据集并设置一些变量

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
data work.Cat2;
set datasets.Cat;

if RaceEthnicity = 1 then NHWH = 1; Else NHWH = 0;
if RaceEthnicity = 2 then NHBL = 1; Else NHBL = 0;
if RaceEthnicity = 3 then HSPA = 1; Else HSPA = 0;
if RaceEthnicity = 4 then OTHE = 1; Else OTHE = 0;

if RaceEthnicity = . then NHWH = .;
if RaceEthnicity = . then NHBL = .;
if RaceEthnicity = . then HSPA = .;
if RaceEthnicity = . then OTHE = .;

label NHWH = Non-Hispanic White;
label NHBL = Non-Hispanic Black;
label HSPA = Hispanic;
label OTHE = Others;
run;

/* Step 3. Recode Education as three new binary variables*/
data work.categorical2;
set datasets.categorical1;

if Education=1 then NoHS=1; Else NoHS=0;
if Education=2 then HS=1;Else HS=0;
if Education=3 then College=1;Else College=0;

if Education=. then NoHS=.;
if Education=. then HS=.;
if Education=. then College=.;

label NoHS = Not graduated from HS;
label HS = Graduated from HS;
label College = Graduated from College;
run;

建立手动输入的数据表

1
2
3
4
5
6
7
8
9
data cohort;
input exposed outcome count; * name of columns
datalines ;
1 1 11
1 2 160
2 1 616
2 2 10020
;
run;

导入csv

1
2
3
4
5
6
7
8
9
/*Importing a text (csv) data file (assuming the model folder system). */
* Enter the address of the folder in which you saved DRfrenchM.csv
(e.g., the instructor saved the file in C:\Users\shoriuchi\Desktop\Bios1 SAS files\introduction ) ;
proc import
out = work.drfrenchm
datafile = "*** enter the file address here ***\DRfrenchM.csv"
dbms = excel;
sheet = "death rates for French males";
run;

导入xls

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/*Importing an Excel data file (assuming the model folder system). */
* Enter the address of the folder in which you saved DRfrenchM.xls
(e.g., the instructor saved the file in C:\Users\shoriuchi\Desktop\Bios1 SAS files\introduction ) ;
proc import
out = work.drfrenchm
datafile = "*** enter the file address here ***\DRfrenchM.xls"
dbms = excel;
sheet = "death rates for French males";
run;

proc import DATAFILE = "R:\NYU_CHES\EPIC Clarity\Received 04112019\Mothers_2019-04-11.xlsx"
OUT = addressout
DBMS = xlsx
replace;
sheet = "1. Address";
getnames = yes;
run;

导出csv

1
2
3
4
PROC EXPORT DATA= WORK.RANDOM200 
OUTFILE= 'F:\Epi 2\random200.csv'
REPLACE;
RUN;

导出xls

1
2
3
4
PROC EXPORT DATA= WORK.RANDOM200 
OUTFILE= 'F:\Epi 2\random200.xls'
DBMS=EXCEL4 REPLACE;
RUN;

显示数据集 print

1
2
3
4
5
6
proc print data=epi2.test;
run;

proc print data=epi2.test;
variables temp ph;
run;

禁用format

1
2
3
4
5
6
options nofmterr;

options nodate nofmterr linesize=100 formdlim='-';
ods trace on;

options nofmterr;

建立table

1
2
3
4
5
6
7
8
9
data cohort;
input exposed outcome count;
datalines ;
1 1 11
1 2 160
2 1 616
2 2 10020
;
run;

建立一种format

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
proc format lib=library;
value hypg 1="Has hyperglycemia"
0="No hyperglycemia";
run;

proc format;
value yno 0='No' 1='Yes';
value gen 0='Male' 1='Female';
value inc 1='<$70/month' 2='$70-$99/month' 3='$100/month or more';
value rac 1='White' 2='Mestizo' 3='Indigenous' 4='African';
run;

proc format;
value agegrp
1 = '1 45-90'
2 = '2 17-44';
value sex
1 = '1 Male'
2 = '2 Female';
value yntwo
1 = '1 Yes'
2 = '2 No';
value ynzero
1 = '1 Yes'
0 = '0 No';
run;

为数据集使用format

1
2
3
4
5
6
7
8
9
10
11
data work.rn2a; set library.rn2;
hyperglycemia = .;
if (LBXGLU >= 110) then hyperglycemia=1;
if (. < LBXGLU < 110) then hyperglycemia=0;
label hyperglycemia = "Hyperglycemia (fasting glucose >= 110 mg/dL)";
format hyperglycemia hypg.;
run;

data exer1; set lab.exer1;
format anemia parasite yno. income inc. race rac. gender gen.;
run;

修改变量label

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
DATA TEMP; 
SET EPI.CONFOUNDING_DS;

LABEL EXPOS12 = "Exposure variable recoded as 1 exposed and 2 unexposed";
EXPOS12 = 2 - EXPOSURE;

LABEL ASTHMA12 = "Asthma status recoded as 1 case and 2 control";
ASTHMA12 = 2 - ASTHMA;

LABEL AGE5= "Age recoded as 1>=5 and 2 <5";
AGE5= 2- AGE;

LABEL SPRING = "Spring variable = 1, not spring = 2";
SPRING = 2 - SEASON;
run;

时间变量读取year hour

1
2
3
4
5
6
7
8
9
data test_year2;
set Testdataredcap;
where year(Date) > 2020;
run;

data test_year4;
set Testdataredcap;
where hour(time) > 20;
run;

在有比重的表格中使用比重

1
2
3
4
5
6
7
8
9
10
11
12
*2x2 table;
*Here, exposed=1 means exposed, and exposed=2 means unexposed;
*Similarly, outcome=1 means D+, and outcome=2 means D-;
proc freq data=cohort;
tables exposed*outcome/relrisk riskdiff;
weight count;
run;

proc logistic data=cohort desc;
model outcome=exposed;
weight count; * weight here means the number of people
run;

在逻辑回归中使用抽样的权重 surveylogistic weight

1
2
3
4
proc surveylogistic data = use_dat2;
weight EXAM_WT;
model obese_10 = mental_10 SPAGE sex_10 mental_10*sex_10;
run;

Basic Data Summary

显示变量

1
2
3
/*See Variables*/
proc contents data = assign.sbpandkidney;
run;

显示头几行数据 head()

1
2
proc print data=a (obs=100);
run;

排序数据 sort

1
2
3
4
5
6
proc sort data = TEMP; by age; run;

proc sort data = Addressout
OUT = Addressout;
BY MOM_MRN;
run;

写出排名 proc rank

1
2
3
4
proc rank data=est_prob out=est_prob groups=10;
var ps;
ranks ps_dec;
proc sort; by ps_dec; run;

读取上面一格和下面一格的值

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
data Addressout;
set Addressout;
judge = 0;
run;

data Addressout;
set Addressout;
by MOM_MRN;
judge = ^(first.MOM_MRN and last.MOM_MRN);
run;

data Addressout;
set Addressout;
where judge = 0;
drop judge;
run;

SQL 嵌套

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
proc sql;
create table as
select a.level China USA
from resdat.china as a
inner join
resdat.usa as b
on a.level=b.level
;
select * from pingpang;
quit;

/*
output
level china usa
1 c02 u00
2 c03 u02
2 c03 u01
3 c04 u03
*/

Descriptive Statistics

显示变量频数 (不要用于连续型变量)(统计缺失值数量)(stratified table)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
proc freq data=work.Cat2;
tables RaceEthnicity NHWH NHBL HSPA OTHE;
run;

/* Step 4. Examination of results of the recoding.*/
proc freq data=work.categorical2;
tables Education NoHS HS College;
run;
*The 'missing' option creates a separate row/column for missing data;

-----------------------

proc freq data = exer1;
title 'Parasitic infection';
table parasite / missing;
run;

--------------------

proc freq data = TEMP;
table EXPOS12 * EXPOSURE / list missing;
table ASTHMA12 * ASTHMA / list missing;
table age5 * age /list missing;
run;

-------------------

proc freq data = TEMP;
table AGE5 * EXPOS12 * ASTHMA12 /relrisk cmh; *add stratifying variable to table statement;
run; *order should be 1 stratifying var, 2 exposure var and 3 outcome var;

显示连续型变量的最小值最大值均值标准差中位数百分位数

1
2
3
4
*Examine the continuous variable 'BMI'; 
proc means data = exer1 n min max mean std median P25 P75;
var bmi;
run;

显示均值meas

1
2
3
4
5
6
7
8
9
/*Step 2. Descriptive statistics for the variables.*/
proc means data = work.drfrenchm;
var Year Age Death_Rate ln_Death_Rate;
run;

proc means data = exer1 n min max mean std median P25 P75;
title 'BMI';
var bmi;
run;

筛选数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
data work.rn2b;
set work.rn2a;
if (MaritalStatus = 1) or (MaritalStatus = 5);
run;

proc logistic data = TEMP DESC;
where spring=2;
model ASTHMA = EXPOSURE ;
run;
*Association among spring births;
proc logistic data = TEMP DESC;
where spring=1;
model ASTHMA = EXPOSURE ;
run;

计算OR RR RD

1
2
3
4
5
proc freq data = Rnone.Rn1agesq;
table Sumldl * Male / cmh relrisk riskdiff;
*last two keywords provide PR or RR and AR depending on the study design;
*cmh requests risks and odds;
run;

Odds ratio (logistic regression)

1
2
3
4
5
*Odds ratio (logistic regression); 
proc logistic data = exer1 descending;
title 'OR: Gender by parasite';
model parasite = gender;
run;

Risk Ratio

1
2
3
4
proc genmod Data=new desc ;
model fstat=gender/link=log dist=bin;
estimate 'Beta' gender 1 / exp;
run;

卡方检验

1
2
3
proc freq data = Rnone.Rn1agesq;
table Sumldl * Sumpir / chisq;
run;

所有变量correlation

1
2
3
4
5
6
7
8
/*corralation all var*/
proc corr data = assign.sbpandkidney;
run;

proc corr data=cars1 ;
VAR horsepower weight ;
BY make; /*like BY gender 分成几组分别cor*/
run;

Cronbach Alpha

1
2
3
4
5
6
7
8
9
10
11
12
13
14
proc univariate data = use_dat;
var MHQ_1;
histogram MHQ_1 / vscale=count barlabel=count;
run;

proc corr data=temp alpha nocorr nomiss;
VAR temp_code MHQ_2 MHQ_3 MHQ_4 MHQ_5 MHQ_6 MHQ_7;
*title1 'Some title';
run;

proc corr data=temp alpha;
VAR temp_code MHQ_2 MHQ_3 MHQ_4 MHQ_5 MHQ_6 MHQ_7;
*title1 'Some title';
run;

Pearson and Spearman correlations;

1
2
3
4
5
*Pearson and Spearman correlations; 
proc corr data = exer1 pearson spearman;
title 'BMI and cholesterol';
var bmi chol;
run;

Plotting

Xy散点图 (4 methods)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*设置图形属性*/
goptions reset=all;
symbol1 value=dot color=black height=1;

---------------------------

/*plot SBP x BUN(ABC*/
proc plot data = assign.sbpandkidney;
plot BPXSY1*LBXSBU;
run;
quit;

/* Step 4b. Scatterplot of residuals by predicted BMI values. (真实散点*/
/* (We want to get a finer graph than that produced by [proc plot].) */
goptions reset=all;
symbol1 value=dot color=black height=1;
proc gplot data=work.ResidualsBMI;
plot RESIDUAL*PREDICTED;
run;

proc sgplot data=lab10.ecological;
SCATTER X = foreign
Y = fastfood;
run;

PROC SGSCATTER DATA = lab10.ecological;
PLOT fastfood*foreign;
TITLE 'Percent of foreign born population and fast food';
run;

Plot Curve

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
data ratiopower;
input ratio power;
cards;
1 .596
2 .732
3 .785
4 .812
5 .829
6 .839
7 .847
8 .853
9 .858
10 .861
;
run;

proc gplot data=ratiopower;
plot power*ratio; symbol line=1 width=5 interpol=join;
title 'Power curve with different cc ratios';
run;

条形图 sgplot

1
2
3
4
5
6
7
8
9
10
11
proc SGPLOT data = work.tides;
vbar center;
run;

proc sgplot data=tmp3 sganno=sganno noautolegend ;
format count0 count1 positive. ;
vbar ps / response=count0 legendlabel=" " datalabel datalabelattrs=(size=10) name='a0' nofill ;
vbar ps / response=count1 legendlabel=" " datalabel datalabelattrs=(size=10) name='a1' fill fillattrs=(color=gray) ;
xaxis labelattrs=(size=15) valueattrs=(size = 10) min = 0 max = 1.0;
yaxis labelattrs=(size=15) valueattrs=(size = 10) ;
run;

Histogram (check whether normal)

1
2
3
4
5
6
7
8
9
10
11
proc univariate normal data=epi2.nhanes3;
var age;
histogram age;
run;

------------------

proc univariate data = work.ResidualsBMI;
var RESIDUAL;
histogram RESIDUAL / normal;
run; quit;

Histogram

1
2
3
proc sgpanel data = work.tides;
histogram t1_appt_date;
run;

Regression

线性回归

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/*regression only let BUN as X*/
proc reg data = assign.sbpandkidney;
model BPXSY1 = LBXSBU;
run;
/*regression let BUN and Age as X*/
proc reg data = assign.sbpandkidney;
model BPXSY1 = LBXSBU Exactage;
run;
quit;

/*Do a temp regression to find the highest negtive beta among races, this time reference 'Others' as default*/
proc reg data = work.Cat2;
model BPXSY1 = Male Exactage NHWH NHBL HSPA;
run;

/*NHWH was only one who got a negtive beta, thus we set it as the reference*/
proc reg data = work.Cat2;
model BPXSY1 = Male Exactage NHBL HSPA OTHE;
run;
quit;

----------------------------

proc genmod data = nhefs;
class exercise active education;
model wt82_71 = qsmk sex race age age*age education
smokeintensity smokeintensity*smokeintensity smokeyrs smokeyrs*smokeyrs
exercise active wt71 wt71*wt71
qsmk*smokeintensity;
run;
quit;

线性回归并计算残差和回归均方

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/* Step 2. Linear regression of systolic blood pressure 
on gender, age, education, race/ethnicity and marital status.
Predicted values and residuals are saved in a dataset named ResidualsSBP. */
proc reg data=library.rn1;
model BPXSY1 = Male Exactage NH_Black Hispanic Others NoHS graduatedHS Widowed Divorced_or_separated Never_married Living_with_partner;
output out = work.ResidualsSBP
P = PREDICTED /* unstandardized predicted value*/
R = RESIDUAL; /* unstandardized residuals*/
run;
quit;

/* Step 3. Correlations between observed SBP, predicted SBP and residuals. */
proc corr data = work.ResidualsSBP;
var BPXSY1 PREDICTED RESIDUAL;
run;

/* Step 4. Variances of observed SBP, predicted SBP and residuals. */
proc means data = work.ResidualsSBP var;
var BPXSY1 PREDICTED RESIDUAL;
run;

线性回归检测分类变量设定

1
2
3
4
5
6
proc reg data=datasets.rn1a;
model BPXDI1 = Male Exactage Agesquared NH_Black Hispanic Others NoHS graduatedHS;
Diff_raceethnicity: test NH_Black=Hispanic=Others=0;
Diff_education: test NoHS=graduatedHS=0;
run;
quit;

General Linear Model

1
2
3
4
5
6
7
8
9
10
11
12
*ANOVA test for comparing a continuous and normally distributed variable across a categorical variable with 2+ categories;
proc glm data = exer1;
title 'ANOVA: BMI by race';
class race; *4 category race variable;
model bmi = race;
means race;
run;

proc glm data= est_prob;
class ps_dec;
model wt82_71= qsmk ps_dec /clparm solution; /*these options use for showing 95% CI*/
run;

残差分布

1
2
3
4
5
6
/* Step 5. Histogram of residuals in comparison with the corresponding normal distribution. */
/* (We want to get a larger graph than that in the default graphic output.) */
proc univariate data = work.ResidualsBMI;
var RESIDUAL;
histogram RESIDUAL / normal;
run; quit;

Univariate Analysis

检测是否正态(qq-plot)

1
2
3
4
5
6
proc sort; by parasite; run;
proc univariate normal;
var bmi;
qqplot bmi / normal ;
by parasite;
run;

单样本t检验

1
2
3
4
5
6
7
8
9
proc ttest data=cars1 alpha=0.05 h0=0;
var horsepower;
run;

proc ttest data = exer1;
title 'BMI by parasite';
class parasite;
var bmi;
run;

配对样本t检验

1
2
3
proc ttest data=cars1 ;
paired weight*length;
run;

两样本t检验

1
2
3
4
5
proc ttest data=cars1 sides=2 alpha=0.05 h0=0;
title "Two sample t-test example";
class make;
var horsepower;
run;

Fisher’s exact test

1
2
3
4
5
6
7
8
9
*Fisher's exact test;
*This is to demonstrate that the values of the dichotomous variables are arbitrary, but
be careful about interpreting the measures of association;
*Look at how the risk differences, the RRs, and the ORs change (or do not) depending on how you code the variables;
proc freq data = new;
title 'Fishers exact tests';
tables parasite*parasite12 anemia*anemia12;
tables parasite*anemia parasite12*anemia12 / chisq relrisk riskdiff fisher; *also obtained in previous PROC;
run;

Wilcoxon rank sum/Mann-Whitney test

1
2
3
4
5
6
7
*Wilcoxon rank sum/Mann-Whitney test for comparing a continuous variable across a categorical variable with 2 categories;
*Non-parametric form of the independent samples t-test;
proc npar1way data = exer1 wilcoxon;
title 'Wilcoxon: BMI by parasite';
class parasite; *2 category parasite variable;
var bmi;
run;

Kruskal-Wallis test

1
2
3
4
5
6
*Kruskal-Wallis test for comparing a continuous variable across a categorical variable with 2+ categories (non-parametric test); 
proc npar1way data = exer1 wilcoxon;
title 'K-W: BMI by race';
class race; *4 category race variable;
var bmi;
run;

One way anova

1
2
3
4
5
PROC ANOVA data = ***;
CLASS *分类变量*
MODEL *连续变量(因变量)* = *** *** ***; /*效应变量列表(不是值)*/
MEANS *分类变量*
RUN;

Logistic Regression

Logistic regression set reference

1
2
3
4
5
6
7
8
9
10
11
proc logistic data=library.rn2;
class RIAGENDR (ref='Male') RaceEthnicity (ref='Non-Hispanic White')
Education (ref='Graduated from College') smoking (ref="Never smoked (< 100 cigarettes)")
Physical_activity (ref='Did not do vigorous or moderate activity') / PARAM=ref;
model HSD010 (order=internal descending) = RIAGENDR Exactage RaceEthnicity Education
Smoking Physical_activity / expb;
* "order=internal" for following the order of values of HSD010 (First, "Excellent",
then "Very Good", ...").;
* "descending" for [from poor to excellent] instead of [from excellent to poor]
so probabilities and odds of being less healthy are computed;
run;

非线性回归并对categorical自变量的β值进行方差检验

1
2
3
4
5
6
7
/* Step 3. Regression of diastolic blood pressure on gender, age(quadratic function), race/ethnicity and education. */
proc reg data=library.rn1a;
model BPXDI1 = Male Exactage Agesquared NH_Black Hispanic Others NoHS graduatedHS;
Diff_raceethnicity: test NH_Black=Hispanic=Others=0;
Diff_education: test NoHS=graduatedHS=0;
run;
quit;

保存逻辑回归的预测值 surveylogistic

1
2
3
4
5
proc surveylogistic data = use_dat2;
OUTPUT OUT = use_dat3 PREDICTED = pscore;
weight EXAM_WT;
model mental_10 = diabete_10 hypert_10 asthma_10 kidney_10 cancer_10;
run;

Logistic回归并储存结果

1
2
3
4
5
6
/* Step 4. Logistic regression of self-reported general health condition on selected demographic variables */
proc logistic data=library.rn2;
class RIAGENDR (ref='Male') RaceEthnicity (ref='Non-Hispanic White') Education (ref='Graduated from College') MaritalStatus (ref='Married') / PARAM=ref;
model HealthCondition(DESC)=Exactage RIAGENDR RaceEthnicity Education MaritalStatus;
score out=work.logisticmodel;
run;

Why DESC?

The purposes of the analysis in the posted example was to set up and estimate a logistic regression model of the probability of having hyperglycemia. If the sentence under the Response Profile table of the SAS output is as shown below, the probability of having hyperglycemia was modelled.

However, if the sentence is as shown below, the probability of not having hyperglycemia was modelled. It needs to be switched to the probability of having hyperglycemia.

The event/condition whose probability is modelled can be switched between the two by adding “(DESC)” or “(DESCENDING)” after the dependent variable name in the “model” statement or by removing it:

1
>model hyperglycemia(DESC) = RIAGENDR Agegroup RaceEthnicity Education PIRgroup / expb;

or

1
>model hyperglycemia = RIAGENDR Agegroup RaceEthnicity Education PIRgroup / expb;

logistic 回归中变连续变量为有序变量

1
2
3
4
5
6
7
*** If we want to express the odds ratio, as every 5, 10, 15 or 20 year increase in age, we can add;
*** a units command and it will calculate the ORs and 95% CIs for us;

proc logistic data=lab3 descending;
model hypertension10 = diabetes10 age / clodds=wald;
units age=5 10 15 20;
run;

等级变量logistic回归

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
proc logistic data=library.rn2;
class RIAGENDR (ref='Male') RaceEthnicity (ref='Non-Hispanic White')
Education (ref='Graduated from College') smoking (ref="Never smoked (< 100 cigarettes)")
Physical_activity (ref='Did not do vigorous or moderate activity') / PARAM=ref;
model HSD010 (order=internal descending) = RIAGENDR Exactage RaceEthnicity Education
Smoking Physical_activity / expb;
* "order=internal" for following the order of values of HSD010 (First, "Excellent",
then "Very Good", ...").;
* "descending" for [from poor to excellent] instead of [from excellent to poor]
so probabilities and odds of being less healthy are computed;
run;

* By default, proc logistic will model the lowest value of the outcome variable.
That means it models NO disease, unless use the option "desc" which means to flip the ordering. ;
proc logistic data = new desc;
*class exposure;
model esophagealC=exposure;
run;

* For a binary predictor, it's not necessary, but for a categorical with 3 or more levels, you need to declare it as a "class" variable;
proc logistic data =new desc;
class exposure12;
model esophagealC=exposure12;
run;

多分无序变量logistic回归

1
2
3
4
5
6
7
8
9
10
11
proc logistic data=work.rn2a;
class BMI3 (ref="20.00-29.99") RIAGENDR (ref='Male') Agegroup (ref="25-29")
RaceEthnicity (ref='Non-Hispanic White') Education (ref='Graduated from College')
smoking (ref="Never smoked (< 100 cigarettes)")
Physical_activity (ref='Did not do vigorous or moderate activity')
/ PARAM=ref;
model BMI3 = RIAGENDR Agegroup RaceEthnicity Education Smoking Physical_activity
/ expb link=glogit;
*"link" gives choices among logit, probit, cloglog and glogit (generalized logit model
for polytomous/multinomial logistic regression);
run;

EMM & Interaction的两种方法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/* Step 3. Logistic regression of hyperglycemia on gender, age, race/ethnicity, education, 
income and physical activity with gender-education interaction. */
proc logistic data=work.rn2a;
class RIAGENDR (ref='Male') Agegroup (ref='25-29') RaceEthnicity (ref='Non-Hispanic White') Education
(ref='Graduated from College') PIRgroup (ref='5.00+') Physical_activity (ref='Did vigorous activity')
/ PARAM=ref;
model hyperglycemia = RIAGENDR Agegroup RaceEthnicity Education PIRgroup Physical_activity RIAGENDR*Education / expb;
run;
-------------------------
data work.rn2a; set work.rn2a;
gen_edu = .;
if (RIAGENDR=1) and (Education=1) then gen_edu=1;
if (RIAGENDR=1) and (Education=2) then gen_edu=2;
if (RIAGENDR=1) and (Education=3) then gen_edu=3;
if (RIAGENDR=2) and (Education=1) then gen_edu=4;
if (RIAGENDR=2) and (Education=2) then gen_edu=5;
if (RIAGENDR=2) and (Education=3) then gen_edu=6;
label gen_edu="Gender x Education";
format gen_edu GEN_EDU.;
run;

/* Step A2. Logistic regression of hyperglycemia on age, race/ethnicity, income,
physical activity, and the new variable gen_edu, for comparison with results of step 3. */
proc logistic data=work.rn2a;
class RIAGENDR (ref='Male') Agegroup (ref='25-29') RaceEthnicity (ref='Non-Hispanic White') Education
(ref='Graduated from College') PIRgroup (ref='5.00+') Physical_activity (ref='Did vigorous activity')
gen_edu (ref = 'M, College') / PARAM=ref;
model hyperglycemia = Agegroup RaceEthnicity PIRgroup Physical_activity gen_edu / expb;
run;

Survival Analysis

Kaplan Meier survival analysis

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
proc lifetest data=lab1 plots=(S) method=km;
time permth_int*death(0);
title 'Kaplan Meier time to all cause mortality';
run;

proc lifetest data=lab1 plots=(S) method=km;
time permth_int*death(0);
where age>=65;
strata sex;
title 'KM of time to all cause mortality by gender among those 65+';
run;

* If you want to add 95% confidence bands (using Hall-Wellner method) to your plots you can do so with the following syntax;
proc lifetest data=lab1 plots=survival(cb=hw test) method=km;
time permth_int*death(0);
where age>=65;
strata sex;
title 'KM of time to all cause mortality by gender among those 65+';
run;

Life table survival analysis

1
2
3
4
proc lifetest data=lab1 plots=(S) method=life width=6;
time permth_int*death(0);
title 'Lifetable time to all cause mortality';
run;

Proportionsal Hazard Cox Model

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
**Hazard ratio;
* 1) You can code exposed as 1 and unexposed as 0 or 2, but diseased should be coded as 1 and non-diseased as 0.
2) SAS orders variables from lower to higher values. If your exposure is coded 0/1 or 1/2, by default,
SAS will use the last category as the reference. If it is coded differently, we can used the REF
keyword and assign our reference (FIRST or LAST);

*Gender only;
proc phreg data =new;
class gender (ref=first);
model lenfol*fstat(0) = gender / risklimits ;
run;

*Gender and age;
proc phreg data = new;
class gender (ref=first);;
model lenfol*fstat(0) = gender age / risklimits;
run;

proc phreg data=lab4;
model stime*stroke(0)=age;
strata diabetes10;
baseline out=temp loglogs=lls;
run;
----------

** Example with person-time;
data cohort2;
input exposed outcome count PT;
datalines ;
1 1 25 105
1 2 175 875
2 1 10 24
2 2 190 950
;
run;

proc freq data=cohort;
tables exposed*outcome /relrisk riskdiff;
weight count;
run;

proc phreg data=cohort;
class exposed;
model PT*outcome(2)= exposed/risklimits;
weight count;
run;

*** Using the Assess PH test;
proc phreg data=lab4;
model stime*stroke(0)=age diabetes10 / rl; /* rl:risk limit;*/
assess ph/resample;
run;

surveyphreg (weighting)

1
2
3
4
5
6
proc surveyphreg data=lab4;
strata strata;
cluster psu;
weight weight;
model stime*stroke(0)= age diabetes10 /rl;
run;

Survival Plot

1
2
3
4
5
proc gplot data=temp;
plot lls*stime=age;
symbol1 interpol=join color=blue line=1 ;
symbol2 interpol=join color=red line=1 ;
run;

Power and Sample size

Simple sample size calculation

1
2
3
4
5
6
7
8
proc power;
twosamplefreq
sides = 2
groupproportions = (.02 .057692)
power = .8
alpha = .05
npergroup=.;
run;

using a Fisher’s exact test;

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
proc power;
twosamplefreq
test=fisher
sides = 2
groupproportions = (.02 .057692)
power = .8
alpha = .05
npergroup=.;
run;


proc power;
twosamplefreq
test=fisher
proportiondiff=0.03 to 0.10 by 0.01
refproportion=.02
power = .8
alpha = .05
npergroup=.;
run;

Simple power calculation

1
2
3
4
5
6
7
8
9
proc power;
twosamplefreq
test=fisher
sides = 2
groupproportions = (.02 .057692)
power = .
alpha = .05
npergroup=500;
run;

A2. Comparing means;

1
2
3
4
5
6
7
8
9
10
11
proc power;
twosamplemeans
test = diff
dist = normal
alpha = .05
groupmeans = (8 12)
stddev = 5
power = .8
groupweights = 1|1
ntotal = .;
run;

A3. Single proportion;

1
2
3
4
5
6
7
8
9
10
11
proc power;
onesamplefreq
test=Z
method=normal
sides = 2
nullproportion = .2
proportion = .3
power = .8
alpha = .05
ntotal = .;
run;

Single means;

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
proc power;
onesamplemeans
test = t
sides=2
nullm=22
mean=24
stddev=3
power=0.8
alpha=0.05
ntotal=.;
run;

proc power;
onesamplemeans
test = t
sides=1
nullm=22
mean=24
stddev=3
power=0.8
alpha=0.05
ntotal=.;
run;

Testing

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
** Exploring what happens to the ratio;
** If we hold all else constant, which ratio gives us the most power?;

proc power;
twosamplefreq
test=fisher
sides=2
groupproportions = (.2 .4)
power = .
alpha = .05
ntotal=500
groupweights = 1 2 3 4 5 6 7 8 9 10 | 1;
run;

*** What happens to our power as we increase our sample size by increasing our case to control ratio?;

proc power;
twosamplefreq
test=fisher
sides=2
groupproportions = (.2 .3)
power = .
alpha = .05
groupns = (200 200) (400 200) (600 200) (800 200) (1000 200) (1200 200) (1400 200) (1600 200) (1800 200) (2000 200);
run;

随机与分组

Random number

1
2
3
4
5
6
7
8
9
10
11
12
13
data randomnumbers;
do n=1 to 200 by 1;
group = int(ranuni(0)*2+1);
output;
end;
run;

proc print data=randomnumbers;
run;

proc freq data=randomnumbers;
table group;
run;

Random Grouping

1
2
3
4
5
6
7
8
*** First 100 mentions are in group 1 / Second 100 mentions are in group 2;
proc plan;
factors id=200 random;
output out=random200;
run;

proc print data=random200;
run;

Block Randomization

1
2
3
4
5
6
7
8
9
10
***  Block randomization;
*** You can also use "proc plan" for block randomization;
*** Suppose I want to ensure more or less equal groups throughout the study;
*** I could do 20 blocks of 10;
*** Within each block the first five mentions would be group 1 and the second five group 2;

proc plan;
factors block = 20 ordered treat = 10 random ;
output out=randblock ;
run;

proc survey select

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
*** C. To obtain a random subsample from a dataset use proc surveyselect;
*** Note, srs means simple random sample without replacement;

libname epi2 '\\tsclient\USB Disk\Epi 2\';

*** If using SAS on my desktop;

libname epi2 'F:\Epi 2';

proc surveyselect data=epi2.class
out = subset
method=srs
sampsize=4;
run;

proc freq data=subset;
tables name;
run;

数字格式

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
data x;
input X1 X2 X3;
datalines;
1 3 0
2 4 0
3 2 0
4 2 0
5 4 0
6 2 0
7 3 0
8 5 0
;
run;

data x(Drop = X2);
set x;
X3 = X1;
format X3 z9.;
run;
data x2;
set x2;
X2 = put(X1,z9.);
run;

data x3;
set x2;
X4 = X3;
X5 = input(X2, 9.);
run;

data x4(Drop = X2 X3);
set x;
run;
data x4;
set x4;
X2 = put(X1, z9.);
run;
data x4;
set x4;
X3 = input(X2, 9.);
run;

/*
best666.
z9.
*/

proc univariate

1
2
3
4
5
6
7
8
9
10
11
12
13
14
proc univariate data = wasiraw;
var wasi_fsiq2_comp;
output out = dataset pctlpts = 5 95 * save 5% 95% percentile
pctlpre = Autoname;
run;

ods csv file = "******\wasi\total_IQ.csv";
proc print data = wasiraw noobs label;
/*NOOBS - 禁止输出中按列号标识每个观察的列 LABEL - 使用变量'标签作为列标题*/
var wasi_subid wasi_fsiq2_comp;
where wasi_fsiq2_comp < 0 or wasi_fsiq2_comp > 0; /*change 0 to the previous output*/
run;
options nodate nonumber;
ods csv close;

Author:Ruoyan Han

Link of This Article:https://ry2an.github.io/2020/08/15/SAS-codes/

Publish Date:August 15th 2020, 9:28:27 pm

Update Date:August 16th 2020, 5:58:21 pm

Copyright Notice:This article is licensed under the Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)

CATALOG
  1. 1. Data Importing
    1. 1.1. 连接路径设置library
    2. 1.2. 建立新数据集并设置一些变量
    3. 1.3. 建立手动输入的数据表
    4. 1.4. 导入csv
    5. 1.5. 导入xls
    6. 1.6. 导出csv
    7. 1.7. 导出xls
    8. 1.8. 显示数据集 print
    9. 1.9. 禁用format
    10. 1.10. 建立table
    11. 1.11. 建立一种format
    12. 1.12. 为数据集使用format
    13. 1.13. 修改变量label
    14. 1.14. 时间变量读取year hour
    15. 1.15. 在有比重的表格中使用比重
    16. 1.16. 在逻辑回归中使用抽样的权重 surveylogistic weight
  2. 2. Basic Data Summary
    1. 2.1. 显示变量
    2. 2.2. 显示头几行数据 head()
    3. 2.3. 排序数据 sort
    4. 2.4. 写出排名 proc rank
    5. 2.5. 读取上面一格和下面一格的值
    6. 2.6. SQL 嵌套
  3. 3. Descriptive Statistics
    1. 3.1. 显示变量频数 (不要用于连续型变量)(统计缺失值数量)(stratified table)
    2. 3.2. 显示连续型变量的最小值最大值均值标准差中位数百分位数
    3. 3.3. 显示均值meas
    4. 3.4. 筛选数据
    5. 3.5. 计算OR RR RD
    6. 3.6. Odds ratio (logistic regression)
    7. 3.7. Risk Ratio
    8. 3.8. 卡方检验
    9. 3.9. 所有变量correlation
    10. 3.10. Cronbach Alpha
    11. 3.11. Pearson and Spearman correlations;
  4. 4. Plotting
    1. 4.1. Xy散点图 (4 methods)
    2. 4.2. Plot Curve
    3. 4.3. 条形图 sgplot
    4. 4.4. Histogram (check whether normal)
    5. 4.5. Histogram
  5. 5. Regression
    1. 5.1. 线性回归
    2. 5.2. 线性回归并计算残差和回归均方
    3. 5.3. 线性回归检测分类变量设定
    4. 5.4. General Linear Model
    5. 5.5. 残差分布
  6. 6. Univariate Analysis
    1. 6.1. 检测是否正态(qq-plot)
    2. 6.2. 单样本t检验
    3. 6.3. 配对样本t检验
    4. 6.4. 两样本t检验
    5. 6.5. Fisher’s exact test
    6. 6.6. Wilcoxon rank sum/Mann-Whitney test
    7. 6.7. Kruskal-Wallis test
    8. 6.8. One way anova
  7. 7. Logistic Regression
    1. 7.1. Logistic regression set reference
    2. 7.2. 非线性回归并对categorical自变量的β值进行方差检验
    3. 7.3. 保存逻辑回归的预测值 surveylogistic
    4. 7.4. Logistic回归并储存结果
  8. 8. 1 >model hyperglycemia = RIAGENDR Agegroup RaceEthnicity Education PIRgroup / expb;
    1. 8.1. logistic 回归中变连续变量为有序变量
    2. 8.2. 等级变量logistic回归
    3. 8.3. 多分无序变量logistic回归
    4. 8.4. EMM & Interaction的两种方法
  9. 9. Survival Analysis
    1. 9.1. Kaplan Meier survival analysis
    2. 9.2. Life table survival analysis
    3. 9.3. Proportionsal Hazard Cox Model
    4. 9.4. surveyphreg (weighting)
    5. 9.5. Survival Plot
  10. 10. Power and Sample size
    1. 10.1. Simple sample size calculation
    2. 10.2. using a Fisher’s exact test;
    3. 10.3. Simple power calculation
    4. 10.4. A2. Comparing means;
    5. 10.5. A3. Single proportion;
    6. 10.6. Single means;
    7. 10.7. Testing
  11. 11. 随机与分组
    1. 11.1. Random number
    2. 11.2. Random Grouping
    3. 11.3. Block Randomization
    4. 11.4. proc survey select
  12. 12. 数字格式
  13. 13. proc univariate