Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,10 @@ Improvements to Clang's diagnostics
- ``-Wreserved-identifier`` now fires on reserved parameter names in a function
declaration which is not a definition.

- A new ``-Wimplicit-unicode-conversion`` warning fires when comparing or implicitly converting
between different Unicode character types (``char8_t``, ``char16_t``, ``char32_t``).
This warning only triggers in C++ as these types are aliases in C. (#GH138526)

Improvements to Clang's time-trace
----------------------------------

Expand Down
3 changes: 3 additions & 0 deletions clang/include/clang/AST/ASTDiagnostic.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ namespace clang {
/// is initialized before passing it in.
QualType desugarForDiagnostic(ASTContext &Context, QualType QT,
bool &ShouldAKA);

/// Render a single UTF code unit as text suitable for a diagnostic.
///
/// \param Value the numeric value of the code unit.
/// \param T the character type the code unit belongs to; it selects whether
/// \p Value is interpreted as a UTF-8, UTF-16 or UTF-32 code unit.
/// \returns the escaped UTF-8 spelling of the character when \p Value is a
/// complete codepoint by itself in that encoding, otherwise the value as
/// upper-case hexadecimal wrapped in angle brackets.
std::string FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T);

} // end namespace clang

#endif
1 change: 1 addition & 0 deletions clang/include/clang/AST/Type.h
Original file line number Diff line number Diff line change
Expand Up @@ -2524,6 +2524,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
bool isChar16Type() const;
bool isChar32Type() const;
bool isAnyCharacterType() const;
/// Determine whether the canonical form of this type is one of the builtin
/// Char8, Char16 or Char32 kinds.
bool isUnicodeCharacterType() const;
bool isIntegralType(const ASTContext &Ctx) const;

/// Determine whether this type is an integral or enumeration type.
Expand Down
1 change: 1 addition & 0 deletions clang/include/clang/Basic/DiagnosticGroups.td
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def EnumConversion : DiagGroup<"enum-conversion",
ImplicitEnumEnumCast,
EnumFloatConversion,
EnumCompareConditional]>;
// Standalone group behind -Wimplicit-unicode-conversion; it has no subgroups.
def ImplicitUnicodeConversion : DiagGroup<"implicit-unicode-conversion">;
def DeprecatedOFast : DiagGroup<"deprecated-ofast">;
def ObjCSignedCharBoolImplicitIntConversion :
DiagGroup<"objc-signed-char-bool-implicit-int-conversion">;
Expand Down
28 changes: 28 additions & 0 deletions clang/include/clang/Basic/DiagnosticSemaKinds.td
Original file line number Diff line number Diff line change
Expand Up @@ -4357,6 +4357,29 @@ def warn_address_of_reference_bool_conversion : Warning<
"code; pointer may be assumed to always convert to true">,
InGroup<UndefinedBoolConversion>;

// Implicit conversion between two different Unicode character types where the
// operand is not a foldable constant and the source type is not wider than the
// destination. Emitted through DiagnoseImpCast; %0/%1 are presumably the source
// and destination types it supplies (confirm against DiagnoseImpCast).
def warn_impcast_unicode_char_type
: Warning<"implicit conversion from %0 to %1 may change the meaning of the "
"represented code unit">,
InGroup<ImplicitUnicodeConversion>;
// Same situation as warn_impcast_unicode_char_type, but chosen instead when the
// source type's integer width is greater than the destination's, so the
// conversion may also truncate.
def warn_impcast_unicode_precision
: Warning<"implicit conversion from %0 to %1 may lose precision and change "
"the meaning of the represented code unit">,
InGroup<ImplicitUnicodeConversion>;
// Implicit conversion of a constant between Unicode character types whose value
// does not keep its meaning in the destination encoding.
//   %0 - type of the converted expression
//   %1 - destination type
//   %2 - 0 selects "code unit", 1 selects "codepoint" (the value is a complete
//        codepoint by itself in the source encoding)
//   %3 - the value as rendered by FormatUTFCodeUnitAsCodepoint
def warn_impcast_unicode_char_type_constant
: Warning<"implicit conversion from %0 to %1 changes the meaning of the "
"%select{code unit|codepoint}2 '%3'">,
InGroup<ImplicitUnicodeConversion>;

// Comparison between operands of two different Unicode character types when at
// least one operand cannot be folded to an integer constant.
//   %0 - unqualified type of the left operand
//   %1 - unqualified type of the right operand
def warn_comparison_unicode_mixed_types
: Warning<"comparing values of different Unicode code unit types %0 and %1 "
"may compare different codepoints">,
InGroup<ImplicitUnicodeConversion>;

// Comparison between two constant operands of different Unicode character types
// where at least one value is not a complete codepoint by itself in its own
// encoding.
//   %0 / %1 - unqualified types of the left / right operand
//   %2 / %3 - left / right value as rendered by FormatUTFCodeUnitAsCodepoint
def warn_comparison_unicode_mixed_types_constant
: Warning<"comparing values of different Unicode code unit types %0 and %1 "
"compares unrelated code units '%2' and '%3'">,
InGroup<ImplicitUnicodeConversion>;

// Warning in the XorUsedAsPow group asking whether exponentiation was intended.
// Per the message text, %0 is the written expression and %1 is its result.
def warn_xor_used_as_pow : Warning<
"result of '%0' is %1; did you mean exponentiation?">,
InGroup<XorUsedAsPow>;
Expand Down Expand Up @@ -7719,6 +7742,11 @@ def warn_comparison_of_mixed_enum_types_switch : Warning<
"%diff{ ($ and $)|}0,1">,
InGroup<EnumCompareSwitch>;

// Non-comparison arithmetic conversion between operands of two different
// Unicode character types.
//   %0 - the ArithConvKind, expanded through the select_arith_conv_kind
//        substitution (defined outside this excerpt)
//   %1 / %2 - unqualified types of the left / right operand
// NOTE(review): the identifier contains a double underscore ("mixed__unicode");
// renaming it requires updating its user in SemaExpr.cpp at the same time.
def warn_arith_conv_mixed__unicode_types
: Warning<"%sub{select_arith_conv_kind}0 "
"different Unicode character types %1 and %2">,
InGroup<ImplicitUnicodeConversion>;

def err_typecheck_assign_const : Error<
"%select{"
"cannot assign to return value because function %1 returns a const value|"
Expand Down
29 changes: 29 additions & 0 deletions clang/lib/AST/ASTDiagnostic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include "clang/AST/TemplateBase.h"
#include "clang/AST/Type.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"

using namespace clang;
Expand Down Expand Up @@ -2190,3 +2192,30 @@ static bool FormatTemplateTypeDiff(ASTContext &Context, QualType FromType,
TD.DiffTemplate();
return TD.Emit();
}

std::string clang::FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T) {
auto IsSingleCodeUnitCP = [](unsigned Value, QualType T) {
if (T->isChar8Type()) {
assert(Value <= 0xFF && "not a valid UTF-8 code unit");
return Value <= 0x7F;
}
if (T->isChar16Type()) {
assert(Value <= 0xFFFF && "not a valid UTF-16 code unit");
return llvm::IsSingleCodeUnitUTF16Codepoint(Value);
}
return llvm::IsSingleCodeUnitUTF32Codepoint(Value);
};
llvm::SmallVector<char, 4> Str;
if (!IsSingleCodeUnitCP(Value, T)) {
llvm::raw_svector_ostream OS(Str);
OS << "<" << llvm::format_hex(Value, 1, /*Upper=*/true) << ">";
return std::string(Str.begin(), Str.end());
}

char Buffer[UNI_MAX_UTF8_BYTES_PER_CODE_POINT];
char *Ptr = Buffer;
[[maybe_unused]] bool Converted = llvm::ConvertCodePointToUTF8(Value, Ptr);
assert(Converted && "trying to encode invalid code unit");
EscapeStringForDiagnostic(StringRef(Buffer, Ptr - Buffer), Str);
return std::string(Str.begin(), Str.end());
}
14 changes: 14 additions & 0 deletions clang/lib/AST/Type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2193,6 +2193,20 @@ bool Type::isAnyCharacterType() const {
}
}

bool Type::isUnicodeCharacterType() const {
const auto *BT = dyn_cast<BuiltinType>(CanonicalType);
if (!BT)
return false;
switch (BT->getKind()) {
default:
return false;
case BuiltinType::Char8:
case BuiltinType::Char16:
case BuiltinType::Char32:
return true;
}
}

/// isSignedIntegerType - Return true if this is an integer type that is
/// signed, according to C99 6.2.5p4 [char, signed char, short, int, long..],
/// an enum decl which has a signed representation
Expand Down
46 changes: 46 additions & 0 deletions clang/lib/Sema/SemaChecking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "CheckExprLifetime.h"
#include "clang/AST/APValue.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/ASTDiagnostic.h"
#include "clang/AST/Attr.h"
#include "clang/AST/AttrIterator.h"
#include "clang/AST/CharUnits.h"
Expand Down Expand Up @@ -11810,6 +11811,46 @@ static void DiagnoseIntInBoolContext(Sema &S, Expr *E) {
}
}

/// Diagnose an implicit conversion of \p E to \p T where source and
/// destination are two different Unicode character types.
///
/// If \p E cannot be folded to an integer, a generic warning is issued via
/// DiagnoseImpCast (the "precision" variant when the source type is wider than
/// \p T). If it can be folded, a warning naming the value is issued unless the
/// value keeps its meaning across the conversion: it is at most 0x7F, or
/// neither type is char8_t and the value lies in 0..0xD7FF or 0xE000..0xFFFF.
///
/// \param Source, Target the types involved; both must be Unicode character
/// types and must differ.
/// \param CC location at which the diagnostic is reported.
static void DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source,
                                                   const Type *Target, Expr *E,
                                                   QualType T,
                                                   SourceLocation CC) {
  assert(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType() &&
         Source != Target);

  Expr::EvalResult Folded;
  if (!E->EvaluateAsInt(Folded, S.getASTContext(), Expr::SE_AllowSideEffects,
                        S.isConstantEvaluatedContext())) {
    // Unknown value: all we can say is that the meaning may change, and
    // whether the conversion also narrows.
    bool Narrows = S.getASTContext().getIntWidth(E->getType()) >
                   S.getASTContext().getIntWidth(T);
    if (Narrows)
      DiagnoseImpCast(S, E, T, CC, diag::warn_impcast_unicode_precision);
    else
      DiagnoseImpCast(S, E, T, CC, diag::warn_impcast_unicode_char_type);
    return;
  }

  llvm::APSInt Constant = Folded.Val.getInt();
  bool InASCIIRange = Constant <= 0x7F;
  // Up to 0xFFFF, excluding the surrogate range 0xD800..0xDFFF.
  bool InBMP =
      Constant <= 0xD7FF || (Constant >= 0xE000 && Constant <= 0xFFFF);
  bool MeaningPreserved =
      InASCIIRange ||
      (!(Source->isChar8Type() || Target->isChar8Type()) && InBMP);
  if (MeaningPreserved)
    return;

  // Pick "codepoint" vs. "code unit" wording based on the source encoding.
  QualType FromTy = E->getType().getUnqualifiedType();
  bool IsWholeCodepoint;
  if (FromTy->isChar8Type())
    IsWholeCodepoint =
        llvm::IsSingleCodeUnitUTF8Codepoint(Constant.getExtValue());
  else if (FromTy->isChar16Type())
    IsWholeCodepoint =
        llvm::IsSingleCodeUnitUTF16Codepoint(Constant.getExtValue());
  else
    IsWholeCodepoint =
        llvm::IsSingleCodeUnitUTF32Codepoint(Constant.getExtValue());

  S.Diag(CC, diag::warn_impcast_unicode_char_type_constant)
      << E->getType() << T << IsWholeCodepoint
      << FormatUTFCodeUnitAsCodepoint(Constant.getExtValue(), E->getType());
}

void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC,
bool *ICContext, bool IsListInit) {
if (E->isTypeDependent() || E->isValueDependent()) return;
Expand Down Expand Up @@ -12147,6 +12188,11 @@ void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC,

DiscardMisalignedMemberAddress(Target, E);

if (Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType()) {
DiagnoseMixedUnicodeImplicitConversion(*this, Source, Target, E, T, CC);
return;
}

if (Target->isBooleanType())
DiagnoseIntInBoolContext(*this, E);

Expand Down
67 changes: 67 additions & 0 deletions clang/lib/Sema/SemaExpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "UsedDeclVisitor.h"
#include "clang/AST/ASTConsumer.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/ASTDiagnostic.h"
#include "clang/AST/ASTLambda.h"
#include "clang/AST/ASTMutationListener.h"
#include "clang/AST/CXXInheritance.h"
Expand Down Expand Up @@ -1567,15 +1568,81 @@ void Sema::checkEnumArithmeticConversions(Expr *LHS, Expr *RHS,
}
}

/// Warn when a binary operation mixes two different Unicode character types.
///
/// Does nothing outside C++, when either operand's unqualified type is not a
/// Unicode character type, or when both operands have the same type.
/// For comparisons, constant operands are inspected: if both values are
/// complete codepoints by themselves in their respective encodings nothing is
/// reported, otherwise the diagnostic names both values. Non-constant
/// comparisons and all other conversion kinds get a type-only warning.
///
/// \param Loc location at which the diagnostic is reported.
/// \param ACK the kind of arithmetic conversion being performed.
static void CheckUnicodeArithmeticConversions(Sema &SemaRef, Expr *LHS,
                                              Expr *RHS, SourceLocation Loc,
                                              ArithConvKind ACK) {
  QualType LTy = LHS->getType().getUnqualifiedType();
  QualType RTy = RHS->getType().getUnqualifiedType();

  if (!(SemaRef.getLangOpts().CPlusPlus && LTy->isUnicodeCharacterType() &&
        RTy->isUnicodeCharacterType()))
    return;

  // Identical types never mix encodings, whatever the operation.
  if (SemaRef.getASTContext().hasSameType(LTy, RTy))
    return;

  if (ACK != ArithConvKind::Comparison) {
    SemaRef.Diag(Loc, diag::warn_arith_conv_mixed__unicode_types)
        << LHS->getSourceRange() << RHS->getSourceRange() << ACK << LTy << RTy;
    return;
  }

  // Try to fold both sides; the right side is only attempted if the left
  // side succeeded.
  auto FoldToInt = [&SemaRef](Expr *Operand, Expr::EvalResult &Out) {
    return Operand->EvaluateAsInt(Out, SemaRef.getASTContext(),
                                  Expr::SE_AllowSideEffects,
                                  SemaRef.isConstantEvaluatedContext());
  };
  Expr::EvalResult LFolded, RFolded;
  if (!(FoldToInt(LHS, LFolded) && FoldToInt(RHS, RFolded))) {
    SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types)
        << LHS->getSourceRange() << RHS->getSourceRange() << LTy << RTy;
    return;
  }

  llvm::APSInt LConst = LFolded.Val.getInt();
  llvm::APSInt RConst = RFolded.Val.getInt();

  // True when the value is a whole codepoint in the encoding implied by Ty.
  auto EncodesWholeCodepoint = [](const QualType &Ty,
                                  const llvm::APSInt &Unit) {
    if (Ty->isChar8Type())
      return llvm::IsSingleCodeUnitUTF8Codepoint(Unit.getExtValue());
    if (Ty->isChar16Type())
      return llvm::IsSingleCodeUnitUTF16Codepoint(Unit.getExtValue());
    return llvm::IsSingleCodeUnitUTF32Codepoint(Unit.getExtValue());
  };

  bool LWhole = EncodesWholeCodepoint(LTy, LConst);
  bool RWhole = EncodesWholeCodepoint(RTy, RConst);
  if (LWhole && RWhole)
    return;

  SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types_constant)
      << LHS->getSourceRange() << RHS->getSourceRange() << LTy << RTy
      << FormatUTFCodeUnitAsCodepoint(LConst.getExtValue(), LTy)
      << FormatUTFCodeUnitAsCodepoint(RConst.getExtValue(), RTy);
}

/// UsualArithmeticConversions - Performs various conversions that are common to
/// binary operators (C99 6.3.1.8). If both operands aren't arithmetic, this
/// routine returns the first non-arithmetic type found. The client is
/// responsible for emitting appropriate error diagnostics.
QualType Sema::UsualArithmeticConversions(ExprResult &LHS, ExprResult &RHS,
SourceLocation Loc,
ArithConvKind ACK) {

checkEnumArithmeticConversions(LHS.get(), RHS.get(), Loc, ACK);

CheckUnicodeArithmeticConversions(*this, LHS.get(), RHS.get(), Loc, ACK);

if (ACK != ArithConvKind::CompAssign) {
LHS = UsualUnaryConversions(LHS.get());
if (LHS.isInvalid())
Expand Down
Loading